acoular 24.3-py3-none-any.whl → 24.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acoular/__init__.py +119 -54
- acoular/calib.py +29 -38
- acoular/configuration.py +132 -82
- acoular/demo/__init__.py +10 -4
- acoular/demo/acoular_demo.py +73 -55
- acoular/environments.py +270 -264
- acoular/fastFuncs.py +366 -196
- acoular/fbeamform.py +1797 -1934
- acoular/grids.py +504 -548
- acoular/h5cache.py +74 -83
- acoular/h5files.py +159 -142
- acoular/internal.py +13 -14
- acoular/microphones.py +57 -53
- acoular/sdinput.py +57 -53
- acoular/signals.py +180 -178
- acoular/sources.py +920 -724
- acoular/spectra.py +353 -363
- acoular/tbeamform.py +416 -416
- acoular/tfastfuncs.py +180 -104
- acoular/tools/__init__.py +25 -0
- acoular/tools/aiaa.py +185 -0
- acoular/tools/helpers.py +189 -0
- acoular/tools/metrics.py +165 -0
- acoular/tprocess.py +1240 -1182
- acoular/traitsviews.py +513 -501
- acoular/trajectory.py +50 -52
- acoular/version.py +5 -6
- acoular/xml/minidsp_uma-16.xml +20 -0
- acoular/xml/{minidsp_uma16.xml → minidsp_uma-16_mirrored.xml} +3 -0
- {acoular-24.3.dist-info → acoular-24.7.dist-info}/METADATA +58 -39
- acoular-24.7.dist-info/RECORD +50 -0
- {acoular-24.3.dist-info → acoular-24.7.dist-info}/WHEEL +1 -1
- acoular-24.7.dist-info/licenses/LICENSE +28 -0
- acoular/fileimport.py +0 -380
- acoular/nidaqimport.py +0 -273
- acoular/tests/reference_data/BeamformerBase.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseFalse1.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseFalse2.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseFalse3.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseFalse4.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseTrue1.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseTrue2.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseTrue3.npy +0 -0
- acoular/tests/reference_data/BeamformerBaseTrue4.npy +0 -0
- acoular/tests/reference_data/BeamformerCMFLassoLarsBIC.npy +0 -0
- acoular/tests/reference_data/BeamformerCMFNNLS.npy +0 -0
- acoular/tests/reference_data/BeamformerCapon.npy +0 -0
- acoular/tests/reference_data/BeamformerClean.npy +0 -0
- acoular/tests/reference_data/BeamformerCleansc.npy +0 -0
- acoular/tests/reference_data/BeamformerCleant.npy +0 -0
- acoular/tests/reference_data/BeamformerCleantSq.npy +0 -0
- acoular/tests/reference_data/BeamformerCleantSqTraj.npy +0 -0
- acoular/tests/reference_data/BeamformerCleantTraj.npy +0 -0
- acoular/tests/reference_data/BeamformerDamas.npy +0 -0
- acoular/tests/reference_data/BeamformerDamasPlus.npy +0 -0
- acoular/tests/reference_data/BeamformerEig.npy +0 -0
- acoular/tests/reference_data/BeamformerEigFalse1.npy +0 -0
- acoular/tests/reference_data/BeamformerEigFalse2.npy +0 -0
- acoular/tests/reference_data/BeamformerEigFalse3.npy +0 -0
- acoular/tests/reference_data/BeamformerEigFalse4.npy +0 -0
- acoular/tests/reference_data/BeamformerEigTrue1.npy +0 -0
- acoular/tests/reference_data/BeamformerEigTrue2.npy +0 -0
- acoular/tests/reference_data/BeamformerEigTrue3.npy +0 -0
- acoular/tests/reference_data/BeamformerEigTrue4.npy +0 -0
- acoular/tests/reference_data/BeamformerFunctional.npy +0 -0
- acoular/tests/reference_data/BeamformerGIB.npy +0 -0
- acoular/tests/reference_data/BeamformerGridlessOrth.npy +0 -0
- acoular/tests/reference_data/BeamformerMusic.npy +0 -0
- acoular/tests/reference_data/BeamformerOrth.npy +0 -0
- acoular/tests/reference_data/BeamformerSODIX.npy +0 -0
- acoular/tests/reference_data/BeamformerTime.npy +0 -0
- acoular/tests/reference_data/BeamformerTimeSq.npy +0 -0
- acoular/tests/reference_data/BeamformerTimeSqTraj.npy +0 -0
- acoular/tests/reference_data/BeamformerTimeTraj.npy +0 -0
- acoular/tests/reference_data/Environment.npy +0 -0
- acoular/tests/reference_data/Example1_numerical_values_testsum.h5 +0 -0
- acoular/tests/reference_data/FiltFiltOctave__.npy +0 -0
- acoular/tests/reference_data/FiltFiltOctave_band_100_0_fraction_Thirdoctave_.npy +0 -0
- acoular/tests/reference_data/FiltFreqWeight_weight_A_.npy +0 -0
- acoular/tests/reference_data/FiltFreqWeight_weight_C_.npy +0 -0
- acoular/tests/reference_data/FiltFreqWeight_weight_Z_.npy +0 -0
- acoular/tests/reference_data/FiltOctave__.npy +0 -0
- acoular/tests/reference_data/FiltOctave_band_100_0_fraction_Thirdoctave_.npy +0 -0
- acoular/tests/reference_data/Filter__.npy +0 -0
- acoular/tests/reference_data/GeneralFlowEnvironment.npy +0 -0
- acoular/tests/reference_data/OctaveFilterBank__.npy +0 -0
- acoular/tests/reference_data/OpenJet.npy +0 -0
- acoular/tests/reference_data/PointSource.npy +0 -0
- acoular/tests/reference_data/PowerSpectra_csm.npy +0 -0
- acoular/tests/reference_data/PowerSpectra_ev.npy +0 -0
- acoular/tests/reference_data/RotatingFlow.npy +0 -0
- acoular/tests/reference_data/SlotJet.npy +0 -0
- acoular/tests/reference_data/TimeAverage__.npy +0 -0
- acoular/tests/reference_data/TimeCumAverage__.npy +0 -0
- acoular/tests/reference_data/TimeExpAverage_weight_F_.npy +0 -0
- acoular/tests/reference_data/TimeExpAverage_weight_I_.npy +0 -0
- acoular/tests/reference_data/TimeExpAverage_weight_S_.npy +0 -0
- acoular/tests/reference_data/TimeInOut__.npy +0 -0
- acoular/tests/reference_data/TimePower__.npy +0 -0
- acoular/tests/reference_data/TimeReverse__.npy +0 -0
- acoular/tests/reference_data/UniformFlowEnvironment.npy +0 -0
- acoular/tests/reference_data/beamformer_traj_time_data.h5 +0 -0
- acoular/tests/run_tests.sh +0 -18
- acoular/tests/run_tests_osx.sh +0 -16
- acoular/tests/test.npy +0 -0
- acoular/tests/test_beamformer_results.py +0 -213
- acoular/tests/test_classes.py +0 -60
- acoular/tests/test_digest.py +0 -125
- acoular/tests/test_environments.py +0 -73
- acoular/tests/test_example1.py +0 -124
- acoular/tests/test_grid.py +0 -92
- acoular/tests/test_integrate.py +0 -102
- acoular/tests/test_signals.py +0 -60
- acoular/tests/test_sources.py +0 -65
- acoular/tests/test_spectra.py +0 -38
- acoular/tests/test_timecache.py +0 -35
- acoular/tests/test_tprocess.py +0 -90
- acoular/tests/test_traj_beamformer_results.py +0 -164
- acoular/tests/unsupported/SpeedComparison/OvernightTestcasesBeamformer_nMics32_nGridPoints100_nFreqs4_nTrials10.png +0 -0
- acoular/tests/unsupported/SpeedComparison/cythonBeamformer.pyx +0 -237
- acoular/tests/unsupported/SpeedComparison/mainForCython.py +0 -103
- acoular/tests/unsupported/SpeedComparison/mainForParallelJit.py +0 -143
- acoular/tests/unsupported/SpeedComparison/setupCythonOpenMP.py +0 -63
- acoular/tests/unsupported/SpeedComparison/sharedFunctions.py +0 -153
- acoular/tests/unsupported/SpeedComparison/timeOverNMics_AllImportantMethods.png +0 -0
- acoular/tests/unsupported/SpeedComparison/timeOverNMics_faverage.png +0 -0
- acoular/tests/unsupported/SpeedComparison/vglOptimierungFAverage.py +0 -204
- acoular/tests/unsupported/SpeedComparison/vglOptimierungGaussSeidel.py +0 -182
- acoular/tests/unsupported/SpeedComparison/vglOptimierungR_BEAMFULL_INVERSE.py +0 -764
- acoular/tests/unsupported/SpeedComparison/vglOptimierungR_BEAM_OS.py +0 -231
- acoular/tests/unsupported/SpeedComparison/whatsFastestWayFor_absASquared.py +0 -48
- acoular/tests/unsupported/functionalBeamformer.py +0 -123
- acoular/tests/unsupported/precisionTest.py +0 -153
- acoular/tests/unsupported/validationOfBeamformerFuncsPOSTAcoularIntegration.py +0 -254
- acoular/tests/unsupported/validationOfBeamformerFuncsPREeAcoularIntegration.py +0 -531
- acoular/tools.py +0 -422
- acoular-24.3.dist-info/RECORD +0 -148
- acoular-24.3.dist-info/licenses/LICENSE +0 -29
- {acoular-24.3.dist-info → acoular-24.7.dist-info}/licenses/AUTHORS.rst +0 -0
--- a/acoular/tests/unsupported/SpeedComparison/vglOptimierungR_BEAMFULL_INVERSE.py
+++ /dev/null
@@ -1,764 +0,0 @@
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
-"""
-Comparison of different optimization approaches to the 'r_beamfull_inverse' method.
-Compared are: Numpy (matrix-vector-calculations), Numba and Cython.
-
-1. Background:
-Currently (python=2) performance-critical acoular methods (e.g.
-faverage, all freqency-domain-beamformers, ...) are optimized via Scipy.weave
-which translates code to c++, including compiling. Those executables can then
-be imported in python.
-Scipy.weave isn't supported anymore in python=3. Furthermore the executables of
-already with Scipy.weave (python=2) compiled code cannot be imported in python=3.
-
-2. Structure of comparison:
-The benchmark in both errors and time consumption is always the Scipy.weave build,
-OpenMP optimized, c++ compiled code. Especially the relative and absolute inf-norm
-errors in the plots refer to the outputs of this function.
-
-3. Remarks to the code:
-In various codes below there are repeating patterns which may need some explanation:
-    a. There is a cast from 64-bit double precision to 32-bit precision of 'temp3',
-       which is the argument to the exp() when calculating the steering vectors
-       -> This down-cast shortens the series expansion of exp() drastically which
-       leads to faster calculations while having acceptable errors. In fact if
-       there is no down-cast, the relative error between otherwise identical
-       methods is about 10^-8.
-    b. The exp() (when calculating the steering vector, see a.) is mostly replaced
-       by a direct calculating of 'cos() - 1j*sin()', which can be done because
-       the input 'temp3' of exp(temp3) is pure imaginary. Because of this the
-       calculation of exp(0)=1 in 'exp(0 - 1j*a) = exp(0) * (cos(a) - 1j*sin(a))'
-       can be spared. This leads to further speed improvements.
-
-4. Remark on the use of Cython:
-See file 'cythonBeamformer.pyx' for remarks on Cython. It showed that at the
-moment Cython doesn't work to well for the beamformer case.
-
-5. Others:
-
-
-Versions used in this script:
-numba=0.34.0
-python=2.7.13
-
-
-
-# multiplizieren mit nMics erfolgt ausserhalb, sonst hier
-
-
-
-"""
-import time as tm
-import threading
-import gc
-
-import numpy as np
-from numba import jit, guvectorize, complex128, complex64, float64, float32, void, uint64, njit, prange
-
-import sharedFunctions as shFncs
-from cythonBeamformer import beamformerCython, beamformerCythonNOTparallel # created with cython
-from beamformer import r_beamfull_inverse # The benchmark (created with scipy.weave)
-from beamformer_withoutMP import r_beamfull_inverse_OhneMP # also created with scipy.weave, but WITHOUT using multiple cores via OpenMP
-
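For orientation while reading the removed benchmark below: points 3.a and 3.b of the module docstring describe the two tricks that recur in almost every variant, the float32 down-cast of the phase argument and the replacement of exp() by cos() - 1j*sin(). A minimal, self-contained sketch of both, with made-up distances and wavenumber rather than data from the script:

```python
import numpy as np

nMics = 8
rng = np.random.default_rng(1)
r0 = 1.5                      # distance grid point -> array centre
rm = r0 + rng.random(nMics)   # distances grid point -> microphones
k = 2 * np.pi * 2000 / 343.0  # wavenumber for 2 kHz at c = 343 m/s

# straightforward formulation with a complex exponential
steer_exp = rm / r0 * np.exp(-1j * k * (rm - r0))

# optimized formulation: down-cast the phase to float32 (3.a) and
# evaluate cos() - 1j*sin() directly instead of exp() (3.b)
temp3 = np.float32(k * (rm - r0))
steer_fast = rm / r0 * (np.cos(temp3) - 1j * np.sin(temp3))

# the down-cast limits the agreement to single precision, roughly 1e-7
print(np.max(np.abs(steer_fast - steer_exp)))
```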
-#%% Formulate the Beamformer as VECTOR * MATRIX * VECTOR product
-def vectorized(csm, e, h, r0, rm, kj):
-    """ Uses Numpys fast array operations, distributed via the mkl-package.
-    Those oparations are already optimized and use all available physical cores.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.complex128)
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(nGridPoints):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] \
-                * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-            beamformOutput[cntFreqs, cntGrid] = np.inner(np.inner(steeringVector.T.conj(), csm[cntFreqs, :, :]), steeringVector)
-    return beamformOutput.real
-
-def vectorizedOptimized(csm, e, h, r0, rm, kj):
-    """ Same as 'vectorized' but including both 3.a. & 3.b. of the documentation
-    string at the beginning of this file. In opposite to the numba-optimized
-    methods below, the use of 'cos() - 1j*sin()' instead of 'exp()' (see 3.a.)
-    doesn't seem to have any speed improvement here.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.complex128)
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in xrange(nGridPoints):
-            temp3 = np.float32(kjj * (rm[cntGrid, :] - r0[cntGrid]))
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * (np.cos(temp3) - 1j * np.sin(temp3))
-            beamformOutput[cntFreqs, cntGrid] = np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :]))
-    return beamformOutput.real
-
-@jit
-def vectorized_NumbaJitOnly(csm, e, h, r0, rm, kj):
-    """ Identical code to vectorized. Just decorated with the most basic
-    jit-optimization routine. If jit is able to translate all variables into
-    primitive datatypes (NOT the native python objects) it will do that. If not,
-    jit will fall back into 'Object mode' (native python objects) which will
-    mostly be much slower.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.complex128)
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(nGridPoints):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] \
-                * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-            beamformOutput[cntFreqs, cntGrid] = np.inner(np.inner(steeringVector.T.conj(), csm[cntFreqs, :, :]), steeringVector)
-    return beamformOutput.real
-
-@jit(nopython=True) # same as directly calling @njit
-def vectorized_NumbaJit_nopythonTrue(csm, e, h, r0, rm, kj):
-    """ In addition to 'vectorized_NumbaJitOnly' the nopython=True (or simply
-    @njit for numby>=0.34.) makes shure that if jit cannot translate the code
-    into primitive datatypes, it will NOT fall back into object mode but
-    instead returns an error.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.complex128)
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(nGridPoints):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-
-#==============================================================================
-# #Not all numpy functions are supported in jit. It took some
-# #tries to figure out how to implement the easy np.inner used in
-# #'vectorized' into jit-supported functions.
-#
-
-# peer = np.inner(np.inner(steeringVector.T.conjugate(), csm[cntFreqs, :, :]), steeringVector)
-# peer2 = np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :])) # Der scheint zu klappen
-# peer3 = np.vdot(steeringVector, np.dot(csm[cntFreqs, :, :], steeringVector))
-#
-# versuch1 = np.inner(steeringVector, csm[cntFreqs, :, :])
-# versuch12 = np.inner(steeringVector.conjugate(), csm[cntFreqs, :, :])
-# versuch2 = np.dot(steeringVector, csm[cntFreqs, :, :]) # complex konj von versuch 12
-# versuch3 = np.dot(csm[cntFreqs, :, :], steeringVector) # gleiche wie versuch1
-# versuch4 = np.dot(steeringVector.conjugate(), csm[cntFreqs, :, :]) # is das kompl conjugierte zu versuch1, versuch3
-# versuch5 = np.dot(csm[cntFreqs, :, :], steeringVector.conjugate()) # is das gleiche wie versuch12
-# ##--> Anscheinend ist die Syntax fuer x^H * A = dot(A, x.conj)
-            beamformOutput[cntFreqs, cntGrid] = np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :])) # This works
-#==============================================================================
-    return beamformOutput.real
-
-@njit(float64[:,:](complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:]))
-def vectorized_NumbaJit_nopythonTrue_DeclareInput(csm, e, h, r0, rm, kj):
-    """ In addition to 'vectorized_NumbaJit_nopythonTrue' the in-/output of the
-    method are declared in the decorator, which normally leads to speed
-    improvements (even though they're very little in this particular case).
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(nGridPoints):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-            beamformOutput[cntFreqs, cntGrid] = np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :])).real
-    return beamformOutput
-
-@njit(float64[:,:](complex128[:,:,:], float64[:], float64[:,:], complex128[:]), parallel=True)
-def vectorizedOptimized_NumbaJit_Parallel(csm, r0, rm, kj):
-    """ The parallel=True flag turns on an automized parallezation process.
-    When one wants to manually parallelize a certain loop one can do so by
-    using prange instead of xrange/range. BUT in this method the prange
-    produced errors. Maybe thats because the numpy package performs
-    parallelization itself, which is then in conflict with prange.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in xrange(nGridPoints): # error when trying with prange
-            temp3 = (kjj * (rm[cntGrid, :] - r0[cntGrid]))
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * (np.cos(temp3) - 1j * np.sin(temp3))
-            beamformOutput[cntFreqs, cntGrid] = (np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :]))).real
-    return beamformOutput
-
-#%% Formulate the Beamformer as LOOPS
-def loops_exactCopyOfCPP(csm, e, h, r0, rm, kj):
-    """ A python copy of the current benchmark function, created with scipy.weave
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    nMics = csm.shape[1]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.complex128)
-    steerVec = np.zeros((nMics), np.complex128)
-
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in xrange(nGridPoints):
-            rs = 0
-            r01 = r0[cntGrid]
-
-            # Calculating of Steering-Vectors
-            for cntMics in xrange(nMics):
-                rm1 = rm[cntGrid, cntMics]
-                rs += 1.0 / (rm1**2)
-                temp3 = np.float32(kjj * (rm1 - r01))
-                steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-            rs = r01 ** 2
-
-            # Calculating of the matrix - vector - multiplication
-            temp1 = 0.0
-            for cntMics in xrange(nMics):
-                temp2 = 0.0
-                for cntMics2 in xrange(cntMics):
-                    temp2 += csm[cntFreqs, cntMics2, cntMics] * steerVec[cntMics2]
-                temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-                temp1 += (csm[cntFreqs, cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-            beamformOutput[cntFreqs, cntGrid] = temp1 / rs
-    return beamformOutput
-
-@njit(float64[:,:](complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:]))
-def loops_NumbaJit_nopythonTrue_exactCopyOfCPP(csm, e, h, r0, rm, kj):
-    """ See 'vectorized_NumbaJit_nopythonTrue_DeclareInput' for explenation of
-    the numba decorator.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    nMics = csm.shape[1]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    steerVec = np.zeros((nMics), np.complex128)
-
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in xrange(nGridPoints):
-            rs = 0
-            r01 = r0[cntGrid]
-            for cntMics in xrange(nMics):
-                rm1 = rm[cntGrid, cntMics]
-                rs += 1.0 / (rm1**2)
-                temp3 = np.float32(kjj * (rm1 - r01))
-                steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-            rs = r01 ** 2
-
-            temp1 = 0.0
-            for cntMics in xrange(nMics):
-                temp2 = 0.0
-                for cntMics2 in xrange(cntMics):
-                    temp2 += csm[cntFreqs, cntMics2, cntMics] * steerVec[cntMics2]
-                temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-                temp1 += (csm[cntFreqs, cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-            beamformOutput[cntFreqs, cntGrid] = temp1 / rs
-    return beamformOutput
-
-
-@njit(float64[:,:](complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:]), parallel=True)
-def loops_NumbaJit_parallel_FirstWritingOfSteer(csm, e, h, r0, rm, kj):
-    """ This method implements the parallelized loop (prange) over the
-    Gridpoints, which is a direct implementation of the currently used
-    c++ method created with scipy.wave.
-
-    Very strange: Just like with Cython, this implementation (prange over Gridpoints)
-    produces wrong results. If one doesn't parallelize -> everything is good
-    (just like with Cython). Maybe Cython and Numba.jit use the same interpreter
-    to generate OpenMP-parallelizable code.
-
-    BUT: If one uncomments the 'steerVec' declaration in the prange-loop over the
-    gridpoints an error occurs. After commenting the line again and executing
-    the script once more, THE BEAMFORMER-RESULTS ARE CORRECT (for repeated tries).
-    Funny enough the method is now twice as slow in comparison to the
-    'wrong version' (before invoking the error).
-
-    A workaround is given by 'loops_NumbaJit_parallel', which is much slower,
-    because the sterring vector is calculated redundantly.
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    nMics = csm.shape[1]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    steerVec = np.zeros((nMics), np.complex128)
-
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in prange(nGridPoints):
-            # steerVec = np.zeros((nMics), np.complex128) # This is the line that has to be uncommented (see this methods documentation comment)
-            rs = 0
-            r01 = r0[cntGrid]
-
-            for cntMics in xrange(nMics):
-                rm1 = rm[cntGrid, cntMics]
-                rs += 1.0 / (rm1**2)
-                temp3 = np.float32(kjj * (rm1 - r01))
-                steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-            rs = r01 ** 2
-
-            temp1 = 0.0
-            for cntMics in xrange(nMics):
-                temp2 = 0.0
-                for cntMics2 in xrange(cntMics):
-                    temp2 = temp2 + csm[cntFreqs, cntMics2, cntMics] * steerVec[cntMics2]
-                temp1 = temp1 + 2 * (temp2 * steerVec[cntMics].conjugate()).real
-                temp1 = temp1 + (csm[cntFreqs, cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-
-            beamformOutput[cntFreqs, cntGrid] = temp1 / rs
-    return beamformOutput
-
-@njit(float64[:,:](complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:]), parallel=True)
-def loops_NumbaJit_parallel(csm, e, h, r0, rm, kj):
-    """ Workaround for the prange error in jit. See documentation comment of
-    'loops_NumbaJit_parallel_FirstWritingOfSteer'.
-    For infos on the numba decorator see 'vectorizedOptimized_NumbaJit_Parallel'
-    """
-    nFreqs = csm.shape[0]
-    nGridPoints = r0.shape[0]
-    nMics = csm.shape[1]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in prange(nGridPoints):
-            r01 = r0[cntGrid]
-            rs = r01 ** 2
-
-            temp1 = 0.0
-            for cntMics in xrange(nMics):
-                temp2 = 0.0
-                rm1 = rm[cntGrid, cntMics]
-                temp3 = np.float32(kjj * (rm1 - r01))
-                steerVec = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-
-                for cntMics2 in xrange(cntMics):
-                    rm1 = rm[cntGrid, cntMics2]
-                    temp3 = np.float32(kjj * (rm1 - r01))
-                    steerVec1 = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1 # Steering vec is calculated redundantly--> very slow
-                    temp2 += csm[cntFreqs, cntMics2, cntMics] * steerVec1
-                temp1 += 2 * (temp2 * steerVec.conjugate()).real
-                temp1 += (csm[cntFreqs, cntMics, cntMics] * steerVec.conjugate() * steerVec).real
-            beamformOutput[cntFreqs, cntGrid] = temp1 / rs
-    return beamformOutput
-
-#%% Multithreading
-
-#Due to pythons global interpreter lock (GIL) only one thread can run at a time.
-#This means, that if one wants to make use of multiple cores, one has to release
-#the GIL for concurrently running threads. Numbas jit can release the gil, if
-#all datatypes are primitive (nopython=True) via nogil=True.
-#This doesn't have to be done with all the numba.guvectorized stuff, as the
-#multithreading happens automatically.
-#I tested
-
-# VERTIG MACHEN
-
-
-
-
-
-nThreadsGlobal = 2 # einmal mit 2 und einmal mit 4 probieren.. vermutung: die saceh
-# die numpy parallelsieirt (ohne jit) arbeitet eh auf beiden cores-> mehr threads bringt dann nichts meh
-
-def vectorized_multiThreading(csm, e, h, r0, rm, kj):
-    """ Prepares the Multithreading of 'vectorized_multiThreading_CoreFunction'.
-    This method does not free the GIL. As descripted above (beginning of
-    Multithreading section) it therefore shouldn't run concurrently (on multiple
-    cores). BUT as numpys mkl package organizes concurrency itself (see 'vectorized'),
-    this
-    """
-    nThreads = nThreadsGlobal
-    dataSizePerThread = nGridPoints / nThreads
-    startingIndexPerThread = [cnt * dataSizePerThread for cnt in range(nThreads + 1)]
-    startingIndexPerThread[-1] = nGridPoints
-    threads = [threading.Thread(target=vectorized_multiThreading_CoreFunction, args=(csm, e, h, r0, rm, kj, startingIndexPerThread[cnt], startingIndexPerThread[cnt+1])) for cnt in range(nThreads)]
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-    return h
-def vectorized_multiThreading_CoreFunction(csm, e, h, r0, rm, kj, startPoint, endPoint):
-    nFreqs = csm.shape[0]
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(startPoint, endPoint):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-            h[cntFreqs, cntGrid] = (np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :]))).real
-
-
-def vectorized_NumbaJit_multiThreading(csm, e, h, r0, rm, kj):
-    """ Prepares the Multithreading of 'vectorized_NumbaJit_multiThreading_CoreFunction'
-    """
-    nThreads = nThreadsGlobal
-    dataSizePerThread = nGridPoints / nThreads
-    startingIndexPerThread = [cnt * dataSizePerThread for cnt in range(nThreads + 1)]
-    startingIndexPerThread[-1] = nGridPoints
-    threads = [threading.Thread(target=vectorized_NumbaJit_multiThreading_CoreFunction, args=(csm, e, h, r0, rm, kj, startingIndexPerThread[cnt], startingIndexPerThread[cnt+1])) for cnt in range(nThreads)]
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-    return h
-@njit(void(complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:], uint64, uint64), nogil=True)
-def vectorized_NumbaJit_multiThreading_CoreFunction(csm, e, h, r0, rm, kj, startPoint, endPoint):
-    nFreqs = csm.shape[0]
-    for cntFreqs in xrange(nFreqs):
-        for cntGrid in xrange(startPoint, endPoint):
-            steeringVector = rm[cntGrid, :] / r0[cntGrid] * np.exp(-1j * kj[cntFreqs].imag * (rm[cntGrid, :] - r0[cntGrid]))
-            h[cntFreqs, cntGrid] = (np.vdot(steeringVector, np.dot(steeringVector, csm[cntFreqs, :, :]))).real
-
-
-def loops_NumbaJit_multiThreading(csm, e, h, r0, rm, kj):
-    """ Prepares the Multithreading of 'loops_NumbaJit_multiThreading_CoreFunction'.
-    Here the cores are used as they should which means:
-    You spawn 2 threads -> cpu uses 2 cores,
-    You spawn 3 threads -> cpu uses 3 cores...
-    """
-    nThreads = nThreadsGlobal
-    dataSizePerThread = nGridPoints / nThreads
-    startingIndexPerThread = [cnt * dataSizePerThread for cnt in range(nThreads + 1)]
-    startingIndexPerThread[-1] = nGridPoints
-    threads = [threading.Thread(target=loops_NumbaJit_multiThreading_CoreFunction, args=(csm, e, h, r0, rm, kj, startingIndexPerThread[cnt], startingIndexPerThread[cnt+1])) for cnt in range(nThreads)]
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-    return h
-@njit(void(complex128[:,:,:], complex128[:], float64[:,:], float64[:], float64[:,:], complex128[:], uint64, uint64), nogil=True)
-def loops_NumbaJit_multiThreading_CoreFunction(csm, e, h, r0, rm, kj, startPoint, endPoint):
-    nFreqs = csm.shape[0]
-    nMics = csm.shape[1]
-    steerVec = np.zeros((nMics), np.complex128)
-
-    for cntFreqs in xrange(nFreqs):
-        kjj = kj[cntFreqs].imag
-        for cntGrid in xrange(startPoint, endPoint):
-            rs = 0
-            r01 = r0[cntGrid]
-            for cntMics in xrange(nMics):
-                rm1 = rm[cntGrid, cntMics]
-                rs += 1.0 / (rm1**2)
-                temp3 = np.float32(kjj * (rm1 - r01))
-                steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-            rs = r01 ** 2
-
-            temp1 = 0.0
-            for cntMics in xrange(nMics):
-                temp2 = 0.0
-                for cntMics2 in xrange(cntMics):
-                    temp2 += csm[cntFreqs, cntMics2, cntMics] * steerVec[cntMics2]
-                temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-                temp1 += (csm[cntFreqs, cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-            h[cntFreqs, cntGrid] = temp1 / rs
-
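The GIL note at the top of this Multithreading section is the reason every *_multiThreading function above comes as a prepare/core pair with nogil=True on the core. A minimal, self-contained sketch of the same pattern on a toy kernel, with invented names and sizes and assuming a current numba and Python 3:

```python
import threading

import numpy as np
from numba import njit

@njit(nogil=True)                 # nopython-compiled kernel that releases the GIL
def fill_squares(out, start, stop):
    for i in range(start, stop):
        out[i] = i * i

out = np.zeros(1_000_000)
mid = out.shape[0] // 2
threads = [threading.Thread(target=fill_squares, args=(out, 0, mid)),
           threading.Thread(target=fill_squares, args=(out, mid, out.shape[0]))]
for t in threads:                 # both halves can now run truly concurrently
    t.start()
for t in threads:
    t.join()
```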
-#%% NUMBA - GUVECTORIZE
-
-@guvectorize([void(complex128[:,:], float64[:], float64[:,:], complex128[:], float64[:])], '(m,m),(g),(g,m),()->(g)', nopython=True, target='parallel')
-def loops_NumbaGuvectorize(csm, r0, rm, kj, h):
-    """ Creating a Numpy-Ufunc: Define it for an input which is n-dimensional.
-    Then call it with an input which is n+1 dimensional. Python takes care of
-    the parallelization over the available cores itself.
-    In this case python parallelizes over the frequencies.
-    Numbas guvectorize doesn't return a value but has to overwrite an result
-    vector passed to the method (in this case 'h') as the last input.
-
-    Short description of the guvectorize decorator:
-    1. Input-Argument: Declaration of output/input datatypes just like
-       with jit, but with a obligatary [] around it
-    2. '(m,m),(g)...': A symbolic explenation of the input dimensions. In this
-       case 'loops_NumbaGuvectorize' is defined for the following input-dim
-       (csm[nMics x nMics], r0[nGridpoints], rm[nGridpoints x nMics], kj (a scalar), h[nGridpoints])
-       , where 'h' contains the calculated results (identified by '->').
-       When you then give an input which tensorical order is exactly one order
-       higher then the here made definition (e.g. csm[!nFreqs! x nMics x nMics]),
-       numba automatically distributes the new tensor order onto the
-       muliple cores (in our case every core computes the beamformer map
-       for a single frequency independently of the others)
-    3. target: one can compute only on one core (target='CPU'), all available
-       cores (target='parallel') or even on graphic cards (target='cuda') (if drivers are installed)
-    4. nopython: See jit-decorator, used above
-
-    See also man page "http://numba.pydata.org/".
-
-    REMARK: Strangly this seemed only to work, if the added order of CSM was its
-    first dimension. E.g. csm[nMics x nMics x nFreqs] didn't seem to work.
-    """
-    nGridPoints = r0.shape[0]
-    nMics = csm.shape[0]
-    steerVec = np.zeros((nMics), np.complex128)
-
-    kjj = kj[0].imag # If input is scalar, it has to be dereferenced using the 'variable[0]'-syntax
-    for cntGrid in xrange(nGridPoints):
-        rs = 0.0
-        r01 = r0[cntGrid]
-
-        for cntMics in xrange(nMics):
-            rm1 = rm[cntGrid, cntMics]
-            rs += 1.0 / (rm1**2)
-            temp3 = np.float32(kjj * (rm1 - r01))
-            steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-        rs = r01 ** 2
-
-        temp1 = 0.0
-        for cntMics in xrange(nMics):
-            temp2 = 0.0
-            for cntMics2 in xrange(cntMics):
-                temp2 += csm[cntMics2, cntMics] * steerVec[cntMics2]
-            temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-            temp1 += (csm[cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-        h[cntGrid] = temp1.real / rs
-
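The dimension-signature mechanics that the docstring above spells out can be shown on a much smaller kernel. A minimal sketch with toy data (the kernel, names and shapes are invented; it follows the same 'write into the last argument' convention as the removed code):

```python
import numpy as np
from numba import guvectorize, float64

# kernel is defined for ONE frequency slice: csm is (m, m), steer is (m), out is scalar
@guvectorize([(float64[:, :], float64[:], float64[:])], '(m,m),(m)->()',
             nopython=True, target='parallel')
def quadratic_form(csm, steer, out):
    acc = 0.0
    for i in range(csm.shape[0]):
        for j in range(csm.shape[1]):
            acc += steer[i] * csm[i, j] * steer[j]
    out[0] = acc

nFreqs, nMics = 4, 8
csm = np.random.rand(nFreqs, nMics, nMics)   # one extra leading axis: frequencies
steer = np.random.rand(nMics)
# the extra axis is broadcast over the core signature and spread across the cores
print(quadratic_form(csm, steer).shape)      # -> (4,)
```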
-#@njit(float64[:,:](complex128[:,:,:], float64[:], float64[:,:], complex128[:])) # right now it doesn't seem to be supported for jit-optimized methods to call guvectorized subroutines. Maybe this will be changed in the future
-def loops_NumbaGuvectorizeOverGrid(csm, r0, rm, kj):
-    """ Similar to 'loops_NumbaGuvectorize', but in this case the UFunc parallelizes
-    over the Gridpoints (as it is done in the scipy.weave version). This leads
-    to significant speed improvements.
-    Thoughts on the speed improvements: I can't see why the pipelining should
-    work any more effective in comparison to 'loops_NumbaGuvectorize' (where
-    the parallelization is done over the frequency-loop), as in both cases the
-    most time is spend in the loop over the gridpoints, so the chain of
-    instructions should essentially be the same.
-    BUT in 'loops_NumbaGuvectorize' the slice of every currently calculated
-    frequency of the CSM is loaded into the shared Cache (e.g. with 4 cores a
-    '4 x nMics x nMics'-tensor is loaded into the shared Cache), whereas with
-    'loops_NumbaGuvectorizeOverGrid' only a '1 x nMics x nMics'-tensor is
-    loaded into the shared Cache. This maybe leads to better managing of resources.
-    """
-    nGridPoints = r0.shape[0]
-    nFreqs = csm.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    for cntFreqs in xrange(nFreqs):
-        result = np.zeros(nGridPoints, np.float64)
-        loops_NumbaGuvectorizeOverGrid_CoreFunction(csm[cntFreqs, :, :], r0, rm, kj[cntFreqs], result)
-        beamformOutput[cntFreqs, :] = result
-    return beamformOutput
-
-@guvectorize([(complex128[:,:], float64[:], float64[:], complex128[:], float64[:])],
-             '(m,m),(),(m),()->()', nopython=True, target='parallel')
-def loops_NumbaGuvectorizeOverGrid_CoreFunction(csm, r0, rm, kj, h):
-    """ CoreFunction of 'loops_NumbaGuvectorizeOverGrid', which does the
-    parallelization over the gridpoints.
-    """
-    nMics = csm.shape[0]
-    steerVec = np.zeros((nMics), np.complex128)
-    kjj = kj[0].imag
-
-    rs = 0.0
-    r01 = r0[0]
-    for cntMics in xrange(nMics):
-        rm1 = rm[cntMics]
-        rs += 1.0 / (rm1**2)
-        temp3 = np.float32(kjj * (rm1 - r01))
-#==============================================================================
-        steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-# steerVec[cntMics] = np.exp(-1j * temp3) * rm1 # is analytically the same as the last line
-#
-# With exp(), instead of cos + 1j* sin, the function is noticeable slower
-# AND the relative error is ca 10^-8 (as with those implementations which
-# don't perform the down cast from double to 32-bit-precision)
-# -> Maybe the exp() performs implicitly a cast back to double if its
-# input is imaginary?!
-#==============================================================================
-    rs = r01 ** 2
-
-    temp1 = 0.0
-    for cntMics in xrange(nMics):
-        temp2 = 0.0
-        for cntMics2 in xrange(cntMics):
-            temp2 += csm[cntMics2, cntMics] * steerVec[cntMics2]
-        temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-        temp1 += (csm[cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-    h[0] = temp1 / rs
-
-
-def loops_NumbaGuvectorizeOverGridNoCast(csm, r0, rm, kj):
-    nGridPoints = r0.shape[0]
-    nFreqs = csm.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    for cntFreqs in xrange(nFreqs):
-        result = np.zeros(nGridPoints, np.float64)
-        loops_NumbaGuvectorizeOverGridNoCast_CoreFunction(csm[cntFreqs, :, :], r0, rm, kj[cntFreqs], result)
-        beamformOutput[cntFreqs, :] = result
-    return beamformOutput
-
-@guvectorize([(complex128[:,:], float64[:], float64[:], complex128[:], float64[:])],
-             '(m,m),(),(m),()->()', nopython=True, target='parallel')
-def loops_NumbaGuvectorizeOverGridNoCast_CoreFunction(csm, r0, rm, kj, h):
-    nMics = csm.shape[0]
-    steerVec = np.zeros((nMics), np.complex128)
-    kjj = kj[0].imag
-
-    rs = 0.0
-    r01 = r0[0]
-    for cntMics in xrange(nMics):
-        rm1 = rm[cntMics]
-        rs += 1.0 / (rm1**2)
-        temp3 = kjj * (rm1 - r01)
-        steerVec[cntMics] = (np.cos(temp3) - 1j * np.sin(temp3)) * rm1
-    rs = r01 ** 2
-
-    temp1 = 0.0
-    for cntMics in xrange(nMics):
-        temp2 = 0.0
-        for cntMics2 in xrange(cntMics):
-            temp2 += csm[cntMics2, cntMics] * steerVec[cntMics2]
-        temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-        temp1 += (csm[cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-    h[0] = temp1 / rs
-
-
-def loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit(csm, r0, rm, kj):
-    nGridPoints = r0.shape[0]
-    nFreqs = csm.shape[0]
-    beamformOutput = np.zeros((nFreqs, nGridPoints), np.float64)
-    for cntFreqs in xrange(nFreqs):
-        result = np.zeros(nGridPoints, np.float64)
-        # loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit_CoreFunction(csm[cntFreqs, :, :], r0, rm, kj[cntFreqs], result)
-        loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit_CoreFunction(np.complex64(csm[cntFreqs, :, :]), np.float32(r0), np.float32(rm), np.complex64(kj[cntFreqs]), np.float32(result))
-        beamformOutput[cntFreqs, :] = result
-    return beamformOutput
-
-@guvectorize([(complex64[:,:], float32[:], float32[:], complex64[:], float32[:])], '(m,m),(),(m),()->()', nopython=True, target='parallel')
-#@guvectorize([(complex128[:,:], float64[:], float64[:], complex128[:], float64[:])], '(m,m),(),(m),()->()', nopython=True, target='parallel')
-def loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit_CoreFunction(csm, r0, rm, kj, h):
-    nMics = csm.shape[0]
-    steerVec = np.zeros((nMics), np.complex64)
-    kjj = np.float32(kj[0].imag)
-
-    r01 = np.float32(r0[0])
-    for cntMics in xrange(nMics):
-        rm1 = np.float32(rm[cntMics])
-        temp3 = np.float32(kjj * (rm[cntMics] - r01))
-        steerVec[cntMics] = np.complex64((np.cos(temp3) - 1j * np.sin(temp3)) * rm1)
-    rs = r01 * r01
-
-    temp1 = np.float32(0.0)
-    # temp1 = np.float64(0.0)
-    for cntMics in xrange(nMics):
-        temp2 = np.complex64(0.0 + 0.0j)
-        # temp2 = np.complex128(0.0 + 0.0j)
-        for cntMics2 in xrange(cntMics):
-            temp2 += csm[cntMics2, cntMics] * steerVec[cntMics2]
-        temp1 += 2 * (temp2 * steerVec[cntMics].conjugate()).real
-        temp1 += (csm[cntMics, cntMics] * np.conjugate(steerVec[cntMics]) * steerVec[cntMics]).real
-    h[0] = temp1 / rs
-#%% MAIN
-listOfMics = [64] #[64, 100, 250, 500, 700, 1000]
-listGridPoints = [5] # [500, 5000, 10000] # Standard value: 12000 # The number of gridpoints doesn't seeme to have to great of an influence
-nTrials = 10
-listOfNFreqs = [20]
-
-#==============================================================================
-# The benchmark function 'r_beamfull_inverse' and also other implementations of
-# the beamformer create a lot of overhead, which influences the computational
-# effort of the succeding function. This is mostly the case, if concurrent
-# calculations are done (multiple cores). So often the first trial of a new
-# function takes some time longer than the other trials.
-#==============================================================================
-
-#funcsToTrial = [vectorized, vectorizedOptimized, vectorized_NumbaJitOnly, \
-#                vectorized_NumbaJit_nopythonTrue, vectorized_NumbaJit_nopythonTrue_DeclareInput, \
-#                vectorizedOptimized_NumbaJit_Parallel, \
-#                loops_exactCopyOfCPP, loops_NumbaJit_nopythonTrue_exactCopyOfCPP, \
-#                loops_NumbaJit_parallel_FirstWritingOfSteer, loops_NumbaJit_parallel, \
-#                vectorized_multiThreading, vectorized_NumbaJit_multiThreading, loops_NumbaJit_multiThreading, \
-#                loops_NumbaGuvectorize, loops_NumbaGuvectorizeOverGrid, \
-#                r_beamfull_inverse_OhneMP, r_beamfull_inverse]
-
-#funcsToTrial = [vectorized, vectorizedOptimized, beamformerCythonNOTparallel, loops_NumbaJit_parallel_FirstWritingOfSteer, \
-#                vectorized_multiThreading, vectorized_NumbaJit_multiThreading, loops_NumbaJit_multiThreading, \
-#                loops_NumbaGuvectorize, loops_NumbaGuvectorizeOverGrid, \
-#                r_beamfull_inverse_OhneMP, r_beamfull_inverse]
-
-funcsToTrial = [loops_NumbaGuvectorize, loops_NumbaGuvectorizeOverGrid, loops_NumbaGuvectorizeOverGridNoCast, loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit, r_beamfull_inverse]
-
-for nMics in listOfMics:
-    for nGridPoints in listGridPoints:
-        for nFreqs in listOfNFreqs:
-            # Init
-            print(10*'-' + 'New Test configuration: nMics=%s, nGridpoints=%s, nFreqs=%s' %(nMics, nGridPoints, nFreqs) + 10*'-')
-            print(10*'-' + 'Creation of inputInputs' + 10*'-')
-
-            # Inputs for the beamformer methods:
-            # At the moment the beamformer-methods are called once per
-            # frequency (CSM is a Matrix, no 3rd-order-tensor)
-            # For easier camparability we build the CSM as a 3rd-order-tensor) instead
-            csm = np.random.rand(nFreqs, nMics, nMics) + 1j*np.random.rand(nFreqs, nMics, nMics) # cross spectral matrix
-            for cntFreqs in range(nFreqs):
-                csm[cntFreqs, :, :] += csm[cntFreqs, :, :].T.conj() # make CSM hermetical
-            e = np.random.rand(nMics) + 1j*np.random.rand(nMics) # has no usage
-            h = np.zeros((nFreqs, nGridPoints)) # results are stored here, if function has no return value
-            r0 = np.random.rand(nGridPoints) # distance between gridpoints and middle of array
-            rm = np.random.rand(nGridPoints, nMics) # distance between gridpoints and all mics in the array
-            kj = np.zeros(nFreqs) + 1j*np.random.rand(nFreqs) # complex wavenumber
-
-            nameOfFuncsToTrial = map(lambda x: x.__name__, funcsToTrial)
-            nameOfFuncsForError = [funcName for funcName in nameOfFuncsToTrial if funcName != 'r_beamfull_inverse']
-            maxRelativeDeviation = np.zeros((len(funcsToTrial), nTrials))
-            maxAbsoluteDeviation = np.zeros((len(funcsToTrial), nTrials))
-            timeConsumption = [[] for _ in range(len(funcsToTrial))]
-            indOfBaselineFnc = nameOfFuncsToTrial.index('r_beamfull_inverse')
-
-            print(10*'-' + 'Onetime calculation of "r_beamfull_inverse" for error reference' + 10*'-')
-            r_beamfull_inverse(csm, e, h, r0, rm, kj)
-            resultReference = h # For relative/absolute error
-            gc.collect()
-
-            # Testing
-            print(10*'-' + 'Testing of functions' + 10*'-')
-            cntFunc = 0
-            for func in funcsToTrial:
-                print(func.__name__)
-                for cntTrials in xrange(nTrials):
-                    h = np.zeros((nFreqs, nGridPoints))
-                    if func.__name__ == 'r_beamfull_inverse' or func.__name__ == 'r_beamfull_inverse_OhneMP':
-                        t0 = tm.time()
-                        func(csm, e, h, r0, rm, kj)
-                        t1 = tm.time()
-                        result = h
-                        # gc.collect()
-                    elif func.__name__ == 'loops_NumbaGuvectorize':
-                        t0 = tm.time()
-                        func(csm, r0, rm, kj, h)
-                        t1 = tm.time()
-                        result = h
-                    elif func.__name__ == 'loops_NumbaGuvectorizeOverGrid' or func.__name__ == 'vectorizedOptimized_NumbaJit_Parallel' or func.__name__ == 'loops_NumbaGuvectorizeOverGridNoCast' or func.__name__ == 'loops_NumbaGuvectorizeOverGridAllCalcsIn32Bit':
-                        t0 = tm.time()
-                        output = func(csm, r0, rm, kj)
-                        t1 = tm.time()
-                        result = output
-                    elif func.__name__ == 'beamformerCython' or func.__name__ == 'beamformerCythonNOTparallel':
-                        t0 = tm.time()
-                        output = func(csm, r0, rm, kj)
-                        t1 = tm.time()
-                        result = np.array(output)
-                    else:
-                        t0 = tm.time()
-                        output = func(csm, e, h, r0, rm, kj)
-                        t1 = tm.time()
-                        result = output
-                    timeConsumption[cntFunc].append(t1 - t0)
-                    relativeDiffBetweenNewCodeAndRef = (result - resultReference) / (result + resultReference) * 2 # error in relation to the resulting value
-                    maxRelativeDeviation[cntFunc, cntTrials] = np.amax(np.amax(abs(relativeDiffBetweenNewCodeAndRef), axis=1), axis=0) # relative error in inf-norm
-                    maxAbsoluteDeviation[cntFunc, cntTrials] = np.amax(np.amax(abs(result - resultReference), axis=1), axis=0) # absolute error in inf-norm
-                cntFunc += 1
-            factorTimeConsump = [np.mean(timeConsumption[cnt]) for cnt in range(0, len(funcsToTrial))] \
-                / np.mean(timeConsumption[indOfBaselineFnc])
-
-            # Save the current test-config as .sav
-            helpString = 'The order of the variables is: \n nameOfFuncsToTrial \n maxRelativeDeviation'\
-                '\n timeConsumption [nFuncs, nTrials] \n nMics \n nGridPoints \n nFreqs '\
-                '\n Factor of time consumption (in relation to the original .cpp) \n maxAbsoluteDeviation \n nThreadsGlobal'
-            saveTupel = (helpString, nameOfFuncsToTrial, maxRelativeDeviation, timeConsumption,
-                         nMics, nGridPoints, nFreqs, factorTimeConsump, maxAbsoluteDeviation, nThreadsGlobal)
-            stringParameters = 'OvernightTestcasesBeamformer_nMics%s_nGridPoints%s_nFreqs%s_nTrials%s' %(nMics, nGridPoints, nFreqs, nTrials)
-
-            stringSaveName = 'Peter'
-            # stringSaveName = 'Sicherung_DurchgelaufeneTests/Beamformer/AllImportantMethods/' + stringParameters
-            # stringSaveName = 'Sicherung_DurchgelaufeneTests/Beamformer/EinflussGridpoints/AMDFX6100/' + stringParameters
-            # stringSaveName = 'Sicherung_DurchgelaufeneTests/Beamformer/JitPrange/' + stringParameters
-            # stringSaveName = 'Sicherung_DurchgelaufeneTests/Beamformer/Multithreading_02Threads/' + stringParameters
-
-            shFncs.savingTimeConsumption(stringSaveName, saveTupel) # saving as "stringSaveName.sav"
-
-            shFncs.plottingOfOvernightTestcasesBeamformer(stringSaveName + '.sav') # plot of the current test-config
-
-#==============================================================================
-#The Following use of the numba decorators could lead to less code (as a function
-#body could be used more often) but is also slower, which is why it wasn't used
-#in this comparison.
-# signature = complex128[:,:](complex128[:,:,:], float64[:], float64[:,:])
-# numbaOptimizedFunction= jit(signature, nopython=True)(plainPythonFunction.py_func)
-#==============================================================================
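As a reading aid for the MAIN block above: the random test fixture ("make CSM hermetical") and the error bookkeeping reduce to a few lines of NumPy. A sketch with invented shapes and stand-in result arrays, not the script's defaults:

```python
import numpy as np

nFreqs, nMics, nGridPoints = 2, 4, 5
csm = np.random.rand(nFreqs, nMics, nMics) + 1j * np.random.rand(nFreqs, nMics, nMics)
csm += csm.transpose(0, 2, 1).conj()          # make every frequency slice Hermitian

result = np.random.rand(nFreqs, nGridPoints)  # stand-ins for two beamformer maps
reference = result + 1e-9 * np.random.rand(nFreqs, nGridPoints)

relative = (result - reference) / (result + reference) * 2
print(np.max(np.abs(relative)))               # relative deviation, inf-norm
print(np.max(np.abs(result - reference)))     # absolute deviation, inf-norm
```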