neverlib 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neverlib/.claude/settings.local.json +9 -0
- neverlib/Docs/audio_aug/test_volume.ipynb +416 -0
- neverlib/Docs/audio_aug_test/test_volume.ipynb +289 -0
- neverlib/Docs/filter/biquad.ipynb +129 -0
- neverlib/Docs/filter/filter_family.ipynb +450 -0
- neverlib/Docs/filter/highpass.ipynb +139 -0
- neverlib/Docs/filter/scipy_filter_family.ipynb +110 -0
- neverlib/Docs/vad/VAD_Energy.ipynb +167 -0
- neverlib/Docs/vad/VAD_Silero.ipynb +325 -0
- neverlib/Docs/vad/VAD_WebRTC.ipynb +189 -0
- neverlib/Docs/vad/VAD_funasr.ipynb +192 -0
- neverlib/Docs/vad/VAD_rvADfast.ipynb +162 -0
- neverlib/Docs/vad/VAD_statistics.ipynb +532 -0
- neverlib/Docs/vad/VAD_tenVAD.ipynb +292 -0
- neverlib/Docs/vad/VAD_vadlib.ipynb +168 -0
- neverlib/Docs/vad/VAD_whisper.ipynb +404 -0
- neverlib/QA/gen_init.py +218 -0
- neverlib/QA/get_fun.py +19 -0
- neverlib/__init__.py +40 -4
- neverlib/audio_aug/HarmonicDistortion.py +19 -13
- neverlib/audio_aug/__init__.py +82 -12
- neverlib/audio_aug/audio_aug.py +19 -14
- neverlib/audio_aug/clip_aug.py +15 -18
- neverlib/audio_aug/coder_aug.py +44 -24
- neverlib/audio_aug/coder_aug2.py +54 -37
- neverlib/audio_aug/loss_packet_aug.py +7 -7
- neverlib/audio_aug/quant_aug.py +19 -17
- neverlib/data/000_short_enhance.wav +0 -0
- neverlib/data/3956_speech.wav +0 -0
- neverlib/data/3956_sweep.wav +0 -0
- neverlib/data/vad_example.wav +0 -0
- neverlib/data/white.wav +0 -0
- neverlib/data/white_EQ.wav +0 -0
- neverlib/data/white_matched.wav +0 -0
- neverlib/data_analyze/__init__.py +69 -20
- neverlib/data_analyze/dataset_analyzer.py +109 -114
- neverlib/data_analyze/quality_metrics.py +87 -89
- neverlib/data_analyze/rms_distrubution.py +23 -42
- neverlib/data_analyze/spectral_analysis.py +43 -46
- neverlib/data_analyze/statistics.py +76 -76
- neverlib/data_analyze/temporal_features.py +15 -6
- neverlib/data_analyze/visualization.py +208 -144
- neverlib/filter/__init__.py +40 -20
- neverlib/filter/auto_eq/__init__.py +50 -31
- neverlib/filter/auto_eq/de_eq.py +0 -2
- neverlib/filter/common.py +24 -5
- neverlib/metrics/DNSMOS/bak_ovr.onnx +0 -0
- neverlib/metrics/DNSMOS/model_v8.onnx +0 -0
- neverlib/metrics/DNSMOS/sig.onnx +0 -0
- neverlib/metrics/DNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/__init__.py +59 -0
- neverlib/metrics/dnsmos.py +4 -15
- neverlib/metrics/pDNSMOS/sig_bak_ovr.onnx +0 -0
- neverlib/metrics/pesq_c/PESQ +0 -0
- neverlib/metrics/pesq_c/dsp.c +553 -0
- neverlib/metrics/pesq_c/dsp.h +138 -0
- neverlib/metrics/pesq_c/pesq.h +294 -0
- neverlib/metrics/pesq_c/pesqdsp.c +1047 -0
- neverlib/metrics/pesq_c/pesqio.c +392 -0
- neverlib/metrics/pesq_c/pesqmain.c +610 -0
- neverlib/metrics/pesq_c/pesqmod.c +1417 -0
- neverlib/metrics/pesq_c/pesqpar.h +297 -0
- neverlib/metrics/snr.py +5 -1
- neverlib/metrics/spec.py +31 -21
- neverlib/metrics/test_pesq.py +0 -4
- neverlib/tests/__init__.py +33 -1
- neverlib/tests/test_imports.py +19 -0
- neverlib/utils/__init__.py +71 -15
- neverlib/utils/audio_split.py +6 -1
- neverlib/utils/checkGPU.py +17 -9
- neverlib/utils/lazy_expose.py +29 -0
- neverlib/utils/utils.py +55 -12
- neverlib/vad/PreProcess.py +66 -66
- neverlib/vad/__init__.py +71 -25
- neverlib/vad/class_get_speech.py +1 -1
- neverlib/vad/class_vad.py +3 -3
- neverlib/vad/img.png +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/METADATA +1 -1
- {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/RECORD +82 -39
- {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/WHEEL +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/licenses/LICENSE +0 -0
- {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1417 @@
|
|
|
1
|
+
/*****************************************************************************
|
|
2
|
+
|
|
3
|
+
Perceptual Evaluation of Speech Quality (PESQ)
|
|
4
|
+
ITU-T Recommendation P.862.
|
|
5
|
+
Version 1.2 - 2 August 2002.
|
|
6
|
+
|
|
7
|
+
****************************************
|
|
8
|
+
PESQ Intellectual Property Rights Notice
|
|
9
|
+
****************************************
|
|
10
|
+
|
|
11
|
+
DEFINITIONS:
|
|
12
|
+
------------
|
|
13
|
+
For the purposes of this Intellectual Property Rights Notice
|
|
14
|
+
the terms "Perceptual Evaluation of Speech Quality Algorithm"
|
|
15
|
+
and "PESQ Algorithm" refer to the objective speech quality
|
|
16
|
+
measurement algorithm defined in ITU-T Recommendation P.862;
|
|
17
|
+
the term "PESQ Software" refers to the C-code component of P.862.
|
|
18
|
+
|
|
19
|
+
NOTICE:
|
|
20
|
+
-------
|
|
21
|
+
All copyright, trade marks, trade names, patents, know-how and
|
|
22
|
+
all or any other intellectual rights subsisting in or used in
|
|
23
|
+
connection with including all algorithms, documents and manuals
|
|
24
|
+
relating to the PESQ Algorithm and or PESQ Software are and remain
|
|
25
|
+
the sole property in law, ownership, regulations, treaties and
|
|
26
|
+
patent rights of the Owners identified below. The user may not
|
|
27
|
+
dispute or question the ownership of the PESQ Algorithm and
|
|
28
|
+
or PESQ Software.
|
|
29
|
+
|
|
30
|
+
OWNERS ARE:
|
|
31
|
+
-----------
|
|
32
|
+
|
|
33
|
+
1. British Telecommunications plc (BT), all rights assigned
|
|
34
|
+
to Psytechnics Limited
|
|
35
|
+
2. Royal KPN NV, all rights assigned to OPTICOM GmbH
|
|
36
|
+
|
|
37
|
+
RESTRICTIONS:
|
|
38
|
+
-------------
|
|
39
|
+
|
|
40
|
+
The user cannot:
|
|
41
|
+
|
|
42
|
+
1. alter, duplicate, modify, adapt, or translate in whole or in
|
|
43
|
+
part any aspect of the PESQ Algorithm and or PESQ Software
|
|
44
|
+
2. sell, hire, loan, distribute, dispose or put to any commercial
|
|
45
|
+
use other than those permitted below in whole or in part any
|
|
46
|
+
aspect of the PESQ Algorithm and or PESQ Software
|
|
47
|
+
|
|
48
|
+
PERMITTED USE:
|
|
49
|
+
--------------
|
|
50
|
+
|
|
51
|
+
The user may:
|
|
52
|
+
|
|
53
|
+
1. Use the PESQ Software to:
|
|
54
|
+
i) understand the PESQ Algorithm; or
|
|
55
|
+
ii) evaluate the ability of the PESQ Algorithm to perform
|
|
56
|
+
its intended function of predicting the speech quality
|
|
57
|
+
of a system; or
|
|
58
|
+
iii) evaluate the computational complexity of the PESQ Algorithm,
|
|
59
|
+
with the limitation that none of said evaluations or its
|
|
60
|
+
results shall be used for external commercial use.
|
|
61
|
+
|
|
62
|
+
2. Use the PESQ Software to test if an implementation of the PESQ
|
|
63
|
+
Algorithm conforms to ITU-T Recommendation P.862.
|
|
64
|
+
|
|
65
|
+
3. With the prior written permission of both Psytechnics Limited
|
|
66
|
+
and OPTICOM GmbH, use the PESQ Software in accordance with the
|
|
67
|
+
above Restrictions to perform work that meets all of the following
|
|
68
|
+
criteria:
|
|
69
|
+
i) the work must contribute directly to the maintenance of an
|
|
70
|
+
existing ITU recommendation or the development of a new ITU
|
|
71
|
+
recommendation under an approved ITU Study Item; and
|
|
72
|
+
ii) the work and its results must be fully described in a
|
|
73
|
+
written contribution to the ITU that is presented at a formal
|
|
74
|
+
ITU meeting within one year of the start of the work; and
|
|
75
|
+
iii) neither the work nor its results shall be put to any
|
|
76
|
+
commercial use other than making said contribution to the ITU.
|
|
77
|
+
Said permission will be provided on a case-by-case basis.
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
ANY OTHER USE OR APPLICATION OF THE PESQ SOFTWARE AND/OR THE PESQ
|
|
81
|
+
ALGORITHM WILL REQUIRE A PESQ LICENCE AGREEMENT, WHICH MAY BE OBTAINED
|
|
82
|
+
FROM EITHER OPTICOM GMBH OR PSYTECHNICS LIMITED.
|
|
83
|
+
|
|
84
|
+
EACH COMPANY OFFERS OEM LICENSE AGREEMENTS, WHICH COMBINE OEM
|
|
85
|
+
IMPLEMENTATIONS OF THE PESQ ALGORITHM TOGETHER WITH A PESQ PATENT LICENSE
|
|
86
|
+
AGREEMENT. PESQ PATENT-ONLY LICENSE AGREEMENTS MAY BE OBTAINED FROM OPTICOM.
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
***********************************************************************
|
|
90
|
+
* OPTICOM GmbH * Psytechnics Limited *
|
|
91
|
+
* Am Weichselgarten 7, * Fraser House, 23 Museum Street, *
|
|
92
|
+
* D- 91058 Erlangen, Germany * Ipswich IP1 1HN, England *
|
|
93
|
+
* Phone: +49 (0) 9131 691 160 * Phone: +44 (0) 1473 261 800 *
|
|
94
|
+
* Fax: +49 (0) 9131 691 325 * Fax: +44 (0) 1473 261 880 *
|
|
95
|
+
* E-mail: info@opticom.de, * E-mail: info@psytechnics.com, *
|
|
96
|
+
* www.opticom.de * www.psytechnics.com *
|
|
97
|
+
***********************************************************************
|
|
98
|
+
|
|
99
|
+
Further information is also available from www.pesq.org
|
|
100
|
+
|
|
101
|
+
*****************************************************************************/
|
|
102
|
+
|
|
103
|
+
#include <math.h>
|
|
104
|
+
#include <stdio.h>
|
|
105
|
+
#include "pesq.h"
|
|
106
|
+
#include "pesqpar.h"
|
|
107
|
+
#include "dsp.h"
|
|
108
|
+
|
|
109
|
+
#define CRITERIUM_FOR_SILENCE_OF_5_SAMPLES 500.
|
|
110
|
+
|
|
111
|
+
float Sl, Sp;
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
int *nr_of_hz_bands_per_bark_band;
|
|
116
|
+
double *centre_of_band_bark;
|
|
117
|
+
double *centre_of_band_hz;
|
|
118
|
+
double *width_of_band_bark;
|
|
119
|
+
double *width_of_band_hz;
|
|
120
|
+
double *pow_dens_correction_factor;
|
|
121
|
+
double *abs_thresh_power;
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
/*
 * Pre-process both signals: each signal's sample buffer and length are
 * handed first to DC_block() and then to apply_filters().
 *
 * ref_info, deg_info : reference / degraded signal descriptors; their
 *                      `data` and `Nsamples` members are used.
 * ftmp               : not used by this function (kept for the interface).
 */
void input_filter(
    SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info, float * ftmp )
{
    /* First pass over both signals. */
    DC_block( ref_info->data, ref_info->Nsamples );
    DC_block( deg_info->data, deg_info->Nsamples );

    /* Second pass over both signals, in the same order. */
    apply_filters( ref_info->data, ref_info->Nsamples );
    apply_filters( deg_info->data, deg_info->Nsamples );
}
|
|
133
|
+
|
|
134
|
+
/*
 * Run apply_VAD() for one signal, passing the signal descriptor together
 * with its own sample buffer and its VAD / logVAD arrays.
 */
void calc_VAD( SIGNAL_INFO * sinfo )
{
    apply_VAD( sinfo,
               sinfo->data,
               sinfo->VAD,
               sinfo->logVAD );
}
|
|
138
|
+
|
|
139
|
+
int id_searchwindows( SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info,
|
|
140
|
+
ERROR_INFO * err_info )
|
|
141
|
+
{
|
|
142
|
+
long Utt_num = 0;
|
|
143
|
+
long count, VAD_length;
|
|
144
|
+
long this_start;
|
|
145
|
+
int speech_flag = 0;
|
|
146
|
+
float VAD_value;
|
|
147
|
+
long del_deg_start;
|
|
148
|
+
long del_deg_end;
|
|
149
|
+
|
|
150
|
+
VAD_length = ref_info-> Nsamples / Downsample;
|
|
151
|
+
|
|
152
|
+
del_deg_start = MINUTTLENGTH - err_info-> Crude_DelayEst / Downsample;
|
|
153
|
+
del_deg_end =
|
|
154
|
+
((*deg_info).Nsamples - err_info-> Crude_DelayEst) / Downsample -
|
|
155
|
+
MINUTTLENGTH;
|
|
156
|
+
|
|
157
|
+
for (count = 0; count < VAD_length; count++)
|
|
158
|
+
{
|
|
159
|
+
VAD_value = ref_info-> VAD [count];
|
|
160
|
+
|
|
161
|
+
if( (VAD_value > 0.0f) && (speech_flag == 0) )
|
|
162
|
+
{
|
|
163
|
+
speech_flag = 1;
|
|
164
|
+
this_start = count;
|
|
165
|
+
err_info-> UttSearch_Start [Utt_num] = count - SEARCHBUFFER;
|
|
166
|
+
if( err_info-> UttSearch_Start [Utt_num] < 0 )
|
|
167
|
+
err_info-> UttSearch_Start [Utt_num] = 0;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
if( ((VAD_value == 0.0f) || (count == (VAD_length-1))) &&
|
|
171
|
+
(speech_flag == 1) )
|
|
172
|
+
{
|
|
173
|
+
speech_flag = 0;
|
|
174
|
+
err_info-> UttSearch_End [Utt_num] = count + SEARCHBUFFER;
|
|
175
|
+
if( err_info-> UttSearch_End [Utt_num] > VAD_length - 1 )
|
|
176
|
+
err_info-> UttSearch_End [Utt_num] = VAD_length -1;
|
|
177
|
+
|
|
178
|
+
if( ((count - this_start) >= MINUTTLENGTH) &&
|
|
179
|
+
(this_start < del_deg_end) &&
|
|
180
|
+
(count > del_deg_start) )
|
|
181
|
+
Utt_num++;
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
err_info-> Nutterances = Utt_num;
|
|
186
|
+
return Utt_num;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
void id_utterances( SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info,
|
|
190
|
+
ERROR_INFO * err_info )
|
|
191
|
+
{
|
|
192
|
+
long Utt_num = 0;
|
|
193
|
+
long Largest_uttsize = 0;
|
|
194
|
+
long count, VAD_length;
|
|
195
|
+
int speech_flag = 0;
|
|
196
|
+
float VAD_value;
|
|
197
|
+
long this_start;
|
|
198
|
+
long last_end;
|
|
199
|
+
long del_deg_start;
|
|
200
|
+
long del_deg_end;
|
|
201
|
+
|
|
202
|
+
VAD_length = ref_info-> Nsamples / Downsample;
|
|
203
|
+
|
|
204
|
+
del_deg_start = MINUTTLENGTH - err_info-> Crude_DelayEst / Downsample;
|
|
205
|
+
del_deg_end =
|
|
206
|
+
((*deg_info).Nsamples - err_info-> Crude_DelayEst) / Downsample -
|
|
207
|
+
MINUTTLENGTH;
|
|
208
|
+
|
|
209
|
+
for (count = 0; count < VAD_length ; count++)
|
|
210
|
+
{
|
|
211
|
+
VAD_value = ref_info-> VAD [count];
|
|
212
|
+
if( (VAD_value > 0.0f) && (speech_flag == 0) )
|
|
213
|
+
{
|
|
214
|
+
speech_flag = 1;
|
|
215
|
+
this_start = count;
|
|
216
|
+
err_info-> Utt_Start [Utt_num] = count;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
if( ((VAD_value == 0.0f) || (count == (VAD_length-1))) &&
|
|
220
|
+
(speech_flag == 1) )
|
|
221
|
+
{
|
|
222
|
+
speech_flag = 0;
|
|
223
|
+
err_info-> Utt_End [Utt_num] = count;
|
|
224
|
+
|
|
225
|
+
if( ((count - this_start) >= MINUTTLENGTH) &&
|
|
226
|
+
(this_start < del_deg_end) &&
|
|
227
|
+
(count > del_deg_start) )
|
|
228
|
+
Utt_num++;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
err_info-> Utt_Start [0] = SEARCHBUFFER;
|
|
233
|
+
err_info-> Utt_End [err_info-> Nutterances-1] = (VAD_length - SEARCHBUFFER);
|
|
234
|
+
|
|
235
|
+
for (Utt_num = 1; Utt_num < err_info-> Nutterances; Utt_num++ )
|
|
236
|
+
{
|
|
237
|
+
this_start = err_info-> Utt_Start [Utt_num];
|
|
238
|
+
last_end = err_info-> Utt_End [Utt_num - 1];
|
|
239
|
+
count = (this_start + last_end) / 2;
|
|
240
|
+
err_info-> Utt_Start [Utt_num] = count;
|
|
241
|
+
err_info-> Utt_End [Utt_num - 1] = count;
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
this_start = (err_info-> Utt_Start [0] * Downsample) + err_info-> Utt_Delay [0];
|
|
245
|
+
if( this_start < (SEARCHBUFFER * Downsample) )
|
|
246
|
+
{
|
|
247
|
+
count = SEARCHBUFFER +
|
|
248
|
+
(Downsample - 1 - err_info-> Utt_Delay [0]) / Downsample;
|
|
249
|
+
err_info-> Utt_Start [0] = count;
|
|
250
|
+
}
|
|
251
|
+
last_end = (err_info-> Utt_End [err_info-> Nutterances-1] * Downsample) +
|
|
252
|
+
err_info-> Utt_Delay [err_info-> Nutterances-1];
|
|
253
|
+
if( last_end > ((*deg_info).Nsamples - SEARCHBUFFER * Downsample) )
|
|
254
|
+
{
|
|
255
|
+
count = ( (*deg_info).Nsamples -
|
|
256
|
+
err_info-> Utt_Delay [err_info-> Nutterances-1] ) / Downsample -
|
|
257
|
+
SEARCHBUFFER;
|
|
258
|
+
err_info-> Utt_End [err_info-> Nutterances-1] = count;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
for (Utt_num = 1; Utt_num < err_info-> Nutterances; Utt_num++ )
|
|
262
|
+
{
|
|
263
|
+
this_start =
|
|
264
|
+
(err_info-> Utt_Start [Utt_num] * Downsample) +
|
|
265
|
+
err_info-> Utt_Delay [Utt_num];
|
|
266
|
+
last_end =
|
|
267
|
+
(err_info-> Utt_End [Utt_num - 1] * Downsample) +
|
|
268
|
+
err_info-> Utt_Delay [Utt_num - 1];
|
|
269
|
+
if( this_start < last_end )
|
|
270
|
+
{
|
|
271
|
+
count = (this_start + last_end) / 2;
|
|
272
|
+
this_start =
|
|
273
|
+
(Downsample - 1 + count - err_info-> Utt_Delay [Utt_num]) / Downsample;
|
|
274
|
+
last_end =
|
|
275
|
+
(count - err_info-> Utt_Delay [Utt_num - 1]) / Downsample;
|
|
276
|
+
err_info-> Utt_Start [Utt_num] = this_start;
|
|
277
|
+
err_info-> Utt_End [Utt_num - 1] = last_end;
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
for (Utt_num = 0; Utt_num < err_info-> Nutterances; Utt_num++ )
|
|
282
|
+
if( (err_info-> Utt_End [Utt_num] - err_info-> Utt_Start [Utt_num])
|
|
283
|
+
> Largest_uttsize )
|
|
284
|
+
Largest_uttsize =
|
|
285
|
+
err_info-> Utt_End [Utt_num] - err_info-> Utt_Start [Utt_num];
|
|
286
|
+
|
|
287
|
+
err_info-> Largest_uttsize = Largest_uttsize;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
void utterance_split( SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info,
|
|
291
|
+
ERROR_INFO * err_info, float * ftmp )
|
|
292
|
+
{
|
|
293
|
+
long Utt_id;
|
|
294
|
+
long Utt_DelayEst;
|
|
295
|
+
long Utt_Delay;
|
|
296
|
+
float Utt_DelayConf;
|
|
297
|
+
long Utt_Start;
|
|
298
|
+
long Utt_End;
|
|
299
|
+
long Utt_SpeechStart;
|
|
300
|
+
long Utt_SpeechEnd;
|
|
301
|
+
long Utt_Len;
|
|
302
|
+
long step;
|
|
303
|
+
long Best_ED1, Best_ED2;
|
|
304
|
+
long Best_D1, Best_D2;
|
|
305
|
+
float Best_DC1, Best_DC2;
|
|
306
|
+
long Best_BP;
|
|
307
|
+
long Largest_uttsize = 0;
|
|
308
|
+
|
|
309
|
+
Utt_id = 0;
|
|
310
|
+
while( (Utt_id < err_info-> Nutterances) &&
|
|
311
|
+
(err_info-> Nutterances < MAXNUTTERANCES) )
|
|
312
|
+
{
|
|
313
|
+
Utt_DelayEst = err_info-> Utt_DelayEst [Utt_id];
|
|
314
|
+
Utt_Delay = err_info-> Utt_Delay [Utt_id];
|
|
315
|
+
Utt_DelayConf = err_info-> Utt_DelayConf [Utt_id];
|
|
316
|
+
Utt_Start = err_info-> Utt_Start [Utt_id];
|
|
317
|
+
Utt_End = err_info-> Utt_End [Utt_id];
|
|
318
|
+
|
|
319
|
+
Utt_SpeechStart = Utt_Start;
|
|
320
|
+
while( (Utt_SpeechStart < Utt_End) && (ref_info-> VAD [Utt_SpeechStart] <= 0.0f) )
|
|
321
|
+
Utt_SpeechStart++;
|
|
322
|
+
Utt_SpeechEnd = Utt_End;
|
|
323
|
+
while( (Utt_SpeechEnd > Utt_Start) && (ref_info-> VAD [Utt_SpeechEnd] <= 0.0f) )
|
|
324
|
+
Utt_SpeechEnd--;
|
|
325
|
+
Utt_SpeechEnd++;
|
|
326
|
+
Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
|
|
327
|
+
|
|
328
|
+
if( Utt_Len >= 200 )
|
|
329
|
+
{
|
|
330
|
+
split_align( ref_info, deg_info, err_info, ftmp,
|
|
331
|
+
Utt_Start, Utt_SpeechStart, Utt_SpeechEnd, Utt_End,
|
|
332
|
+
Utt_DelayEst, Utt_DelayConf,
|
|
333
|
+
&Best_ED1, &Best_D1, &Best_DC1,
|
|
334
|
+
&Best_ED2, &Best_D2, &Best_DC2,
|
|
335
|
+
&Best_BP );
|
|
336
|
+
|
|
337
|
+
if( (Best_DC1 > Utt_DelayConf) && (Best_DC2 > Utt_DelayConf) )
|
|
338
|
+
{
|
|
339
|
+
for (step = err_info-> Nutterances-1; step > Utt_id; step-- )
|
|
340
|
+
{
|
|
341
|
+
err_info-> Utt_DelayEst [step +1] = err_info-> Utt_DelayEst [step];
|
|
342
|
+
err_info-> Utt_Delay [step +1] = err_info-> Utt_Delay [step];
|
|
343
|
+
err_info-> Utt_DelayConf [step +1] = err_info-> Utt_DelayConf [step];
|
|
344
|
+
err_info-> Utt_Start [step +1] = err_info-> Utt_Start [step];
|
|
345
|
+
err_info-> Utt_End [step +1] = err_info-> Utt_End [step];
|
|
346
|
+
err_info-> UttSearch_Start [step +1] = err_info-> Utt_Start [step];
|
|
347
|
+
err_info-> UttSearch_End [step +1] = err_info-> Utt_End [step];
|
|
348
|
+
}
|
|
349
|
+
err_info-> Nutterances++;
|
|
350
|
+
|
|
351
|
+
err_info-> Utt_DelayEst [Utt_id] = Best_ED1;
|
|
352
|
+
err_info-> Utt_Delay [Utt_id] = Best_D1;
|
|
353
|
+
err_info-> Utt_DelayConf [Utt_id] = Best_DC1;
|
|
354
|
+
|
|
355
|
+
err_info-> Utt_DelayEst [Utt_id +1] = Best_ED2;
|
|
356
|
+
err_info-> Utt_Delay [Utt_id +1] = Best_D2;
|
|
357
|
+
err_info-> Utt_DelayConf [Utt_id +1] = Best_DC2;
|
|
358
|
+
|
|
359
|
+
err_info-> UttSearch_Start [Utt_id +1] = err_info-> UttSearch_Start [Utt_id];
|
|
360
|
+
err_info-> UttSearch_End [Utt_id +1] = err_info-> UttSearch_End [Utt_id];
|
|
361
|
+
|
|
362
|
+
if( Best_D2 < Best_D1 )
|
|
363
|
+
{
|
|
364
|
+
err_info-> Utt_Start [Utt_id] = Utt_Start;
|
|
365
|
+
err_info-> Utt_End [Utt_id] = Best_BP;
|
|
366
|
+
err_info-> Utt_Start [Utt_id +1] = Best_BP;
|
|
367
|
+
err_info-> Utt_End [Utt_id +1] = Utt_End;
|
|
368
|
+
}
|
|
369
|
+
else
|
|
370
|
+
{
|
|
371
|
+
err_info-> Utt_Start [Utt_id] = Utt_Start;
|
|
372
|
+
err_info-> Utt_End [Utt_id] = Best_BP + (Best_D2 - Best_D1) / (2 * Downsample);
|
|
373
|
+
err_info-> Utt_Start [Utt_id +1] = Best_BP - (Best_D2 - Best_D1) / (2 * Downsample);
|
|
374
|
+
err_info-> Utt_End [Utt_id +1] = Utt_End;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
if( (err_info-> Utt_Start [Utt_id] - SEARCHBUFFER) * Downsample + Best_D1 < 0 )
|
|
378
|
+
err_info-> Utt_Start [Utt_id] =
|
|
379
|
+
SEARCHBUFFER + (Downsample - 1 - Best_D1) / Downsample;
|
|
380
|
+
|
|
381
|
+
if( (err_info-> Utt_End [Utt_id +1] * Downsample + Best_D2) >
|
|
382
|
+
((*deg_info).Nsamples - SEARCHBUFFER * Downsample) )
|
|
383
|
+
err_info-> Utt_End [Utt_id +1] =
|
|
384
|
+
((*deg_info).Nsamples - Best_D2) / Downsample - SEARCHBUFFER;
|
|
385
|
+
|
|
386
|
+
}
|
|
387
|
+
else Utt_id++;
|
|
388
|
+
}
|
|
389
|
+
else Utt_id++;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
for (Utt_id = 0; Utt_id < err_info-> Nutterances; Utt_id++ )
|
|
393
|
+
if( (err_info-> Utt_End [Utt_id] - err_info-> Utt_Start [Utt_id])
|
|
394
|
+
> Largest_uttsize )
|
|
395
|
+
Largest_uttsize =
|
|
396
|
+
err_info-> Utt_End [Utt_id] - err_info-> Utt_Start [Utt_id];
|
|
397
|
+
|
|
398
|
+
err_info-> Largest_uttsize = Largest_uttsize;
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
void utterance_locate( SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info,
|
|
402
|
+
ERROR_INFO * err_info, float * ftmp )
|
|
403
|
+
{
|
|
404
|
+
long Utt_id;
|
|
405
|
+
|
|
406
|
+
id_searchwindows( ref_info, deg_info, err_info );
|
|
407
|
+
|
|
408
|
+
for (Utt_id = 0; Utt_id < err_info-> Nutterances; Utt_id++)
|
|
409
|
+
{
|
|
410
|
+
crude_align( ref_info, deg_info, err_info, Utt_id, ftmp);
|
|
411
|
+
time_align(ref_info, deg_info, err_info, Utt_id, ftmp );
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
id_utterances( ref_info, deg_info, err_info );
|
|
415
|
+
|
|
416
|
+
utterance_split( ref_info, deg_info, err_info, ftmp );
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
/*
 * Power spectrum of one windowed frame.
 *
 * Nf           : frame length in samples.
 * info         : signal; samples info->data[start_sample .. start_sample+Nf-1]
 *                are read.
 * window       : Nf window coefficients.
 * hz_spectrum  : output, Nf/2 values (re^2 + im^2 per bin); bin 0 is
 *                forced to 0.
 * fft_tmp      : scratch buffer that receives the windowed frame and is
 *                transformed by RealFFT().
 */
void short_term_fft (int Nf, SIGNAL_INFO *info, float *window, long start_sample, float *hz_spectrum, float *fft_tmp) {
    const int n_bins = Nf / 2;
    int i;

    /* Window the frame into the scratch buffer. */
    for (i = 0; i < Nf; i++) {
        fft_tmp[i] = info->data[start_sample + i] * window[i];
    }

    RealFFT (fft_tmp, Nf);

    /* The buffer is read as interleaved (re, im) pairs. */
    for (i = 0; i < n_bins; i++) {
        const float re = fft_tmp[2 * i];
        const float im = fft_tmp[2 * i + 1];

        hz_spectrum[i] = re * re + im * im;
    }

    hz_spectrum[0] = 0;
}
|
|
436
|
+
|
|
437
|
+
void freq_warping (int number_of_hz_bands, float *hz_spectrum, int Nb, float *pitch_pow_dens, long frame) {
|
|
438
|
+
|
|
439
|
+
int hz_band = 0;
|
|
440
|
+
int bark_band;
|
|
441
|
+
double sum;
|
|
442
|
+
|
|
443
|
+
for (bark_band = 0; bark_band < Nb; bark_band++) {
|
|
444
|
+
int n = nr_of_hz_bands_per_bark_band [bark_band];
|
|
445
|
+
int i;
|
|
446
|
+
|
|
447
|
+
sum = 0;
|
|
448
|
+
for (i = 0; i < n; i++) {
|
|
449
|
+
sum += hz_spectrum [hz_band++];
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
sum *= pow_dens_correction_factor [bark_band];
|
|
453
|
+
sum *= Sp;
|
|
454
|
+
pitch_pow_dens [frame * Nb + bark_band] = (float) sum;
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
float total_audible (int frame, float *pitch_pow_dens, float factor) {
|
|
459
|
+
int band;
|
|
460
|
+
float h, threshold;
|
|
461
|
+
double result;
|
|
462
|
+
|
|
463
|
+
result = 0.;
|
|
464
|
+
for (band= 1; band< Nb; band++) {
|
|
465
|
+
h = pitch_pow_dens [frame * Nb + band];
|
|
466
|
+
threshold = (float) (factor * abs_thresh_power [band]);
|
|
467
|
+
if (h > threshold) {
|
|
468
|
+
result += h;
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
return (float) result;
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
void time_avg_audible_of (int number_of_frames, int *silent, float *pitch_pow_dens, float *avg_pitch_pow_dens, int total_number_of_frames)
|
|
475
|
+
{
|
|
476
|
+
int frame;
|
|
477
|
+
int band;
|
|
478
|
+
|
|
479
|
+
for (band = 0; band < Nb; band++) {
|
|
480
|
+
double result = 0;
|
|
481
|
+
for (frame = 0; frame < number_of_frames; frame++) {
|
|
482
|
+
if (!silent [frame]) {
|
|
483
|
+
float h = pitch_pow_dens [frame * Nb + band];
|
|
484
|
+
if (h > 100 * abs_thresh_power [band]) {
|
|
485
|
+
result += h;
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
avg_pitch_pow_dens [band] = (float) (result / total_number_of_frames);
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
void freq_resp_compensation (int number_of_frames, float *pitch_pow_dens_ref, float *avg_pitch_pow_dens_ref, float *avg_pitch_pow_dens_deg, float constant)
|
|
495
|
+
{
|
|
496
|
+
int band;
|
|
497
|
+
|
|
498
|
+
for (band = 0; band < Nb; band++) {
|
|
499
|
+
float x = (avg_pitch_pow_dens_deg [band] + constant) / (avg_pitch_pow_dens_ref [band] + constant);
|
|
500
|
+
int frame;
|
|
501
|
+
|
|
502
|
+
if (x > (float) 100.0) {x = (float) 100.0;}
|
|
503
|
+
if (x < (float) 0.01) {x = (float) 0.01;}
|
|
504
|
+
|
|
505
|
+
for (frame = 0; frame < number_of_frames; frame++) {
|
|
506
|
+
pitch_pow_dens_ref [frame * Nb + band] *= x;
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
#define ZWICKER_POWER 0.23
|
|
512
|
+
|
|
513
|
+
void intensity_warping_of (float *loudness_dens, int frame, float *pitch_pow_dens)
|
|
514
|
+
{
|
|
515
|
+
int band;
|
|
516
|
+
float h;
|
|
517
|
+
double modified_zwicker_power;
|
|
518
|
+
|
|
519
|
+
for (band = 0; band < Nb; band++) {
|
|
520
|
+
float threshold = (float) abs_thresh_power [band];
|
|
521
|
+
float input = pitch_pow_dens [frame * Nb + band];
|
|
522
|
+
|
|
523
|
+
if (centre_of_band_bark [band] < (float) 4) {
|
|
524
|
+
h = (float) 6 / ((float) centre_of_band_bark [band] + (float) 2);
|
|
525
|
+
} else {
|
|
526
|
+
h = (float) 1;
|
|
527
|
+
}
|
|
528
|
+
if (h > (float) 2) {h = (float) 2;}
|
|
529
|
+
h = (float) pow (h, (float) 0.15);
|
|
530
|
+
modified_zwicker_power = ZWICKER_POWER * h;
|
|
531
|
+
|
|
532
|
+
if (input > threshold) {
|
|
533
|
+
loudness_dens [band] = (float) (pow (threshold / 0.5, modified_zwicker_power)
|
|
534
|
+
* (pow (0.5 + 0.5 * input / threshold, modified_zwicker_power) - 1));
|
|
535
|
+
} else {
|
|
536
|
+
loudness_dens [band] = 0;
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
loudness_dens [band] *= (float) Sl;
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
float pseudo_Lp (int n, float *x, float p) {
|
|
544
|
+
double totalWeight = 0;
|
|
545
|
+
double result = 0;
|
|
546
|
+
int band;
|
|
547
|
+
|
|
548
|
+
for (band = 1; band < Nb; band++) {
|
|
549
|
+
float h = (float) fabs (x [band]);
|
|
550
|
+
float w = (float) width_of_band_bark [band];
|
|
551
|
+
float prod = h * w;
|
|
552
|
+
|
|
553
|
+
result += pow (prod, p);
|
|
554
|
+
totalWeight += w;
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
result /= totalWeight;
|
|
558
|
+
result = pow (result, 1/p);
|
|
559
|
+
result *= totalWeight;
|
|
560
|
+
|
|
561
|
+
return (float) result;
|
|
562
|
+
}
|
|
563
|
+
void multiply_with_asymmetry_factor (float *disturbance_dens,
|
|
564
|
+
int frame,
|
|
565
|
+
const float * const pitch_pow_dens_ref,
|
|
566
|
+
const float * const pitch_pow_dens_deg)
|
|
567
|
+
{
|
|
568
|
+
int i;
|
|
569
|
+
float ratio, h;
|
|
570
|
+
|
|
571
|
+
for (i = 0; i < Nb; i++) {
|
|
572
|
+
ratio = (pitch_pow_dens_deg [frame * Nb + i] + (float) 50)
|
|
573
|
+
/ (pitch_pow_dens_ref [frame * Nb + i] + (float) 50);
|
|
574
|
+
|
|
575
|
+
h = (float) pow (ratio, (float) 1.2);
|
|
576
|
+
if (h > (float) 12) {h = (float) 12;}
|
|
577
|
+
if (h < (float) 3) {h = (float) 0.0;}
|
|
578
|
+
|
|
579
|
+
disturbance_dens [i] *= h;
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
/*
 * Sum of squares of x[start_sample .. stop_sample-1], divided by divisor.
 *
 * Each square is formed in float and accumulated in double.
 * The process is terminated with exit(1) if start_sample is negative or
 * greater than stop_sample.
 */
double pow_of (const float * const x, long start_sample, long stop_sample, long divisor) {
    double energy = 0;
    long   idx;

    if ((start_sample < 0) || (start_sample > stop_sample)) {
        exit (1);
    }

    idx = start_sample;
    while (idx < stop_sample) {
        const float s = x[idx];

        energy += s * s;
        idx++;
    }

    return energy / divisor;
}
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
/*
 * Estimate the lag between two series over [start_sample, stop_sample)
 * from the FFT-based cross-correlation of their absolute values.
 *
 * search_range    : lags -search_range .. search_range-1 are examined.
 * max_correlation : receives the largest normalised correlation magnitude
 *                   found (0 if either series has power <= 1E-6, in which
 *                   case 0 is returned immediately).
 *
 * Returns the lag with the largest normalised correlation; ties keep the
 * earliest lag in scan order (negative lags first).
 */
int compute_delay (long start_sample,
                   long stop_sample,
                   long search_range,
                   float *time_series1,
                   float *time_series2,
                   float *max_correlation) {

    const long n       = stop_sample - start_sample;
    const long fft_len = nextpow2 (2 * n);
    const long buf_len = fft_len + 2;
    double     power1, power2, normalization;
    double     corr;
    float     *spec1, *spec2, *xcorr;
    long       i, lag, idx;
    long       best_delay;

    power1 = pow_of (time_series1, start_sample, stop_sample, n) * (double) n / (double) fft_len;
    power2 = pow_of (time_series2, start_sample, stop_sample, n) * (double) n / (double) fft_len;
    normalization = sqrt (power1 * power2);

    /* Nothing to correlate against. */
    if ((power1 <= 1E-6) || (power2 <= 1E-6)) {
        *max_correlation = 0;
        return 0;
    }

    spec1 = (float *) safe_malloc (buf_len * sizeof (float));
    spec2 = (float *) safe_malloc (buf_len * sizeof (float));
    xcorr = (float *) safe_malloc (buf_len * sizeof (float));

    for (i = 0; i < buf_len; i++) {
        spec1[i] = 0.;
        spec2[i] = 0.;
        xcorr[i] = 0.;
    }

    /* Rectified input, zero padded to fft_len. */
    for (i = 0; i < n; i++) {
        spec1[i] = (float) fabs (time_series1[i + start_sample]);
        spec2[i] = (float) fabs (time_series2[i + start_sample]);
    }

    RealFFT (spec1, fft_len);
    RealFFT (spec2, fft_len);

    /* Scale the first spectrum by 1/fft_len, then form
       conj(spec1) * spec2 on interleaved (re, im) pairs. */
    for (i = 0; i <= fft_len / 2; i++) {
        const long re = 2 * i;
        const long im = 2 * i + 1;

        spec1[re] /= fft_len;
        spec1[im] /= fft_len;

        xcorr[re] =  spec1[re] * spec2[re] + spec1[im] * spec2[im];
        xcorr[im] = -spec1[im] * spec2[re] + spec1[re] * spec2[im];
    }

    RealIFFT (xcorr, fft_len);

    best_delay = 0;
    *max_correlation = 0;

    /* Negative lags are read from the upper end of the circular result. */
    for (lag = -search_range; lag < search_range; lag++) {
        idx  = (lag < 0) ? (lag + fft_len) : lag;
        corr = (float) fabs (xcorr[idx]) / normalization;
        if (fabs (corr) > (double) *max_correlation) {
            *max_correlation = (float) fabs (corr);
            best_delay = lag;
        }
    }

    safe_free (spec1);
    safe_free (spec2);
    safe_free (xcorr);

    return best_delay;
}
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
#define NUMBER_OF_PSQM_FRAMES_PER_SYLLABE 20
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
float Lpq_weight (int start_frame,
|
|
690
|
+
int stop_frame,
|
|
691
|
+
float power_syllable,
|
|
692
|
+
float power_time,
|
|
693
|
+
float *frame_disturbance,
|
|
694
|
+
float *time_weight) {
|
|
695
|
+
|
|
696
|
+
double result_time= 0;
|
|
697
|
+
double total_time_weight_time = 0;
|
|
698
|
+
int start_frame_of_syllable;
|
|
699
|
+
|
|
700
|
+
for (start_frame_of_syllable = start_frame;
|
|
701
|
+
start_frame_of_syllable <= stop_frame;
|
|
702
|
+
start_frame_of_syllable += NUMBER_OF_PSQM_FRAMES_PER_SYLLABE/2) {
|
|
703
|
+
|
|
704
|
+
double result_syllable = 0;
|
|
705
|
+
int count_syllable = 0;
|
|
706
|
+
int frame;
|
|
707
|
+
|
|
708
|
+
for (frame = start_frame_of_syllable;
|
|
709
|
+
frame < start_frame_of_syllable + NUMBER_OF_PSQM_FRAMES_PER_SYLLABE;
|
|
710
|
+
frame++) {
|
|
711
|
+
if (frame <= stop_frame) {
|
|
712
|
+
float h = frame_disturbance [frame];
|
|
713
|
+
result_syllable += pow (h, power_syllable);
|
|
714
|
+
}
|
|
715
|
+
count_syllable++;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
result_syllable /= count_syllable;
|
|
719
|
+
result_syllable = pow (result_syllable, (double) 1/power_syllable);
|
|
720
|
+
|
|
721
|
+
result_time+= pow (time_weight [start_frame_of_syllable - start_frame] * result_syllable, power_time);
|
|
722
|
+
total_time_weight_time += pow (time_weight [start_frame_of_syllable - start_frame], power_time);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
result_time /= total_time_weight_time;
|
|
726
|
+
result_time= pow (result_time, (float) 1 / power_time);
|
|
727
|
+
|
|
728
|
+
return (float) result_time;
|
|
729
|
+
}
|
|
730
|
+
|
|
731
|
+
/*
 * Overwrite every sample of info->data (indices 0 .. info->Nsamples - 1)
 * with amplitude * sin(omega * index).
 */
void set_to_sine (SIGNAL_INFO *info, float amplitude, float omega) {
    long k = 0;

    while (k < info-> Nsamples) {
        info-> data [k] = amplitude * (float) sin (omega * k);
        k++;
    }
}
|
|
738
|
+
|
|
739
|
+
/*
 * Return the largest value in x[start .. stop-1].
 *
 * The running maximum is seeded with -1E20f, so that value is returned
 * when the range is empty (start >= stop) or when no element exceeds it.
 */
float maximum_of (float *x, long start, long stop) {
    float best = -1E20f;
    long idx = start;

    while (idx < stop) {
        const float v = x [idx++];
        if (v > best) {
            best = v;
        }
    }

    return best;
}
|
|
751
|
+
|
|
752
|
+
float integral_of (float *x, long frames_after_start) {
|
|
753
|
+
double result = 0;
|
|
754
|
+
int band;
|
|
755
|
+
|
|
756
|
+
for (band = 1; band < Nb; band++) {
|
|
757
|
+
result += x [frames_after_start * Nb + band] * width_of_band_bark [band];
|
|
758
|
+
}
|
|
759
|
+
return (float) result;
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
return (float) result;
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
#define DEBUG_FR 0
|
|
766
|
+
|
|
767
|
+
void pesq_psychoacoustic_model(SIGNAL_INFO * ref_info,
|
|
768
|
+
SIGNAL_INFO * deg_info,
|
|
769
|
+
ERROR_INFO * err_info,
|
|
770
|
+
float * ftmp)
|
|
771
|
+
{
|
|
772
|
+
|
|
773
|
+
long maxNsamples = max (ref_info-> Nsamples, deg_info-> Nsamples);
|
|
774
|
+
long Nf = Downsample * 8L;
|
|
775
|
+
long start_frame, stop_frame;
|
|
776
|
+
long samples_to_skip_at_start, samples_to_skip_at_end;
|
|
777
|
+
float sum_of_5_samples;
|
|
778
|
+
long n, i;
|
|
779
|
+
float power_ref, power_deg;
|
|
780
|
+
long frame;
|
|
781
|
+
float *fft_tmp;
|
|
782
|
+
float *hz_spectrum_ref, *hz_spectrum_deg;
|
|
783
|
+
float *pitch_pow_dens_ref, *pitch_pow_dens_deg;
|
|
784
|
+
float *loudness_dens_ref, *loudness_dens_deg;
|
|
785
|
+
float *avg_pitch_pow_dens_ref, *avg_pitch_pow_dens_deg;
|
|
786
|
+
float *deadzone;
|
|
787
|
+
float *disturbance_dens, *disturbance_dens_asym_add;
|
|
788
|
+
float total_audible_pow_ref, total_audible_pow_deg;
|
|
789
|
+
int *silent;
|
|
790
|
+
float oldScale, scale;
|
|
791
|
+
int *frame_was_skipped;
|
|
792
|
+
float *frame_disturbance;
|
|
793
|
+
float *frame_disturbance_asym_add;
|
|
794
|
+
float *total_power_ref;
|
|
795
|
+
int utt;
|
|
796
|
+
|
|
797
|
+
#ifdef CALIBRATE
|
|
798
|
+
int periodInSamples;
|
|
799
|
+
int numberOfPeriodsPerFrame;
|
|
800
|
+
float omega;
|
|
801
|
+
#endif
|
|
802
|
+
|
|
803
|
+
float peak;
|
|
804
|
+
|
|
805
|
+
#define MAX_NUMBER_OF_BAD_INTERVALS 1000
|
|
806
|
+
|
|
807
|
+
int *frame_is_bad;
|
|
808
|
+
int *smeared_frame_is_bad;
|
|
809
|
+
int start_frame_of_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
810
|
+
int stop_frame_of_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
811
|
+
int start_sample_of_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
812
|
+
int stop_sample_of_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
813
|
+
int number_of_samples_in_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
814
|
+
int delay_in_samples_in_bad_interval [MAX_NUMBER_OF_BAD_INTERVALS];
|
|
815
|
+
int number_of_bad_intervals= 0;
|
|
816
|
+
int search_range_in_samples;
|
|
817
|
+
int bad_interval;
|
|
818
|
+
float *untweaked_deg = NULL;
|
|
819
|
+
float *tweaked_deg = NULL;
|
|
820
|
+
float *doubly_tweaked_deg = NULL;
|
|
821
|
+
int there_is_a_bad_frame = FALSE;
|
|
822
|
+
float *time_weight;
|
|
823
|
+
float d_indicator, a_indicator;
|
|
824
|
+
int nn;
|
|
825
|
+
|
|
826
|
+
float Whanning [Nfmax];
|
|
827
|
+
|
|
828
|
+
for (n = 0L; n < Nf; n++ ) {
|
|
829
|
+
Whanning [n] = (float)(0.5 * (1.0 - cos((TWOPI * n) / Nf)));
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
switch (Fs) {
|
|
833
|
+
case 8000:
|
|
834
|
+
Nb = 42;
|
|
835
|
+
Sl = (float) Sl_8k;
|
|
836
|
+
Sp = (float) Sp_8k;
|
|
837
|
+
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
|
|
838
|
+
centre_of_band_bark = centre_of_band_bark_8k;
|
|
839
|
+
centre_of_band_hz = centre_of_band_hz_8k;
|
|
840
|
+
width_of_band_bark = width_of_band_bark_8k;
|
|
841
|
+
width_of_band_hz = width_of_band_hz_8k;
|
|
842
|
+
pow_dens_correction_factor = pow_dens_correction_factor_8k;
|
|
843
|
+
abs_thresh_power = abs_thresh_power_8k;
|
|
844
|
+
break;
|
|
845
|
+
case 16000:
|
|
846
|
+
Nb = 49;
|
|
847
|
+
Sl = (float) Sl_16k;
|
|
848
|
+
Sp = (float) Sp_16k;
|
|
849
|
+
nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
|
|
850
|
+
centre_of_band_bark = centre_of_band_bark_16k;
|
|
851
|
+
centre_of_band_hz = centre_of_band_hz_16k;
|
|
852
|
+
width_of_band_bark = width_of_band_bark_16k;
|
|
853
|
+
width_of_band_hz = width_of_band_hz_16k;
|
|
854
|
+
pow_dens_correction_factor = pow_dens_correction_factor_16k;
|
|
855
|
+
abs_thresh_power = abs_thresh_power_16k;
|
|
856
|
+
break;
|
|
857
|
+
default:
|
|
858
|
+
printf ("Invalid sample frequency!\n");
|
|
859
|
+
exit (1);
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
samples_to_skip_at_start = 0;
|
|
863
|
+
do {
|
|
864
|
+
sum_of_5_samples= (float) 0;
|
|
865
|
+
for (i = 0; i < 5; i++) {
|
|
866
|
+
sum_of_5_samples += (float) fabs (ref_info-> data [SEARCHBUFFER * Downsample + samples_to_skip_at_start + i]);
|
|
867
|
+
}
|
|
868
|
+
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) {
|
|
869
|
+
samples_to_skip_at_start++;
|
|
870
|
+
}
|
|
871
|
+
} while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
|
872
|
+
&& (samples_to_skip_at_start < maxNsamples / 2));
|
|
873
|
+
|
|
874
|
+
samples_to_skip_at_end = 0;
|
|
875
|
+
do {
|
|
876
|
+
sum_of_5_samples= (float) 0;
|
|
877
|
+
for (i = 0; i < 5; i++) {
|
|
878
|
+
sum_of_5_samples += (float) fabs (ref_info-> data [maxNsamples - SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000) - 1 - samples_to_skip_at_end - i]);
|
|
879
|
+
}
|
|
880
|
+
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) {
|
|
881
|
+
samples_to_skip_at_end++;
|
|
882
|
+
}
|
|
883
|
+
} while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
|
|
884
|
+
&& (samples_to_skip_at_end < maxNsamples / 2));
|
|
885
|
+
|
|
886
|
+
start_frame = samples_to_skip_at_start / (Nf /2);
|
|
887
|
+
stop_frame = (maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000) - samples_to_skip_at_end) / (Nf /2) - 1;
|
|
888
|
+
|
|
889
|
+
power_ref = (float) pow_of (ref_info-> data,
|
|
890
|
+
SEARCHBUFFER * Downsample,
|
|
891
|
+
maxNsamples - SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000),
|
|
892
|
+
maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000));
|
|
893
|
+
power_deg = (float) pow_of (deg_info-> data,
|
|
894
|
+
SEARCHBUFFER * Downsample,
|
|
895
|
+
maxNsamples - SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000),
|
|
896
|
+
maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000));
|
|
897
|
+
|
|
898
|
+
fft_tmp = (float *) safe_malloc ((Nf + 2) * sizeof (float));
|
|
899
|
+
hz_spectrum_ref = (float *) safe_malloc ((Nf / 2) * sizeof (float));
|
|
900
|
+
hz_spectrum_deg = (float *) safe_malloc ((Nf / 2) * sizeof (float));
|
|
901
|
+
|
|
902
|
+
frame_is_bad = (int *) safe_malloc ((stop_frame + 1) * sizeof (int));
|
|
903
|
+
smeared_frame_is_bad=(int *) safe_malloc ((stop_frame + 1) * sizeof (int));
|
|
904
|
+
|
|
905
|
+
silent = (int *) safe_malloc ((stop_frame + 1) * sizeof (int));
|
|
906
|
+
|
|
907
|
+
pitch_pow_dens_ref = (float *) safe_malloc ((stop_frame + 1) * Nb * sizeof (float));
|
|
908
|
+
pitch_pow_dens_deg = (float *) safe_malloc ((stop_frame + 1) * Nb * sizeof (float));
|
|
909
|
+
|
|
910
|
+
frame_was_skipped = (int *) safe_malloc ((stop_frame + 1) * sizeof (int));
|
|
911
|
+
|
|
912
|
+
frame_disturbance = (float *) safe_malloc ((stop_frame + 1) * sizeof (float));
|
|
913
|
+
frame_disturbance_asym_add = (float *) safe_malloc ((stop_frame + 1) * sizeof (float));
|
|
914
|
+
|
|
915
|
+
avg_pitch_pow_dens_ref = (float *) safe_malloc (Nb * sizeof (float));
|
|
916
|
+
avg_pitch_pow_dens_deg = (float *) safe_malloc (Nb * sizeof (float));
|
|
917
|
+
loudness_dens_ref = (float *) safe_malloc (Nb * sizeof (float));
|
|
918
|
+
loudness_dens_deg = (float *) safe_malloc (Nb * sizeof (float));;
|
|
919
|
+
deadzone = (float *) safe_malloc (Nb * sizeof (float));;
|
|
920
|
+
disturbance_dens = (float *) safe_malloc (Nb * sizeof (float));
|
|
921
|
+
disturbance_dens_asym_add = (float *) safe_malloc (Nb * sizeof (float));
|
|
922
|
+
|
|
923
|
+
time_weight = (float *) safe_malloc ((stop_frame + 1) * sizeof (float));
|
|
924
|
+
total_power_ref = (float *) safe_malloc ((stop_frame + 1) * sizeof (float));
|
|
925
|
+
|
|
926
|
+
#ifdef CALIBRATE
|
|
927
|
+
periodInSamples = Fs / 1000;
|
|
928
|
+
numberOfPeriodsPerFrame = Nf / periodInSamples;
|
|
929
|
+
omega = (float) (TWOPI / periodInSamples);
|
|
930
|
+
peak;
|
|
931
|
+
|
|
932
|
+
set_to_sine (ref_info, (float) 29.54, (float) omega);
|
|
933
|
+
#endif
|
|
934
|
+
|
|
935
|
+
for (frame = 0; frame <= stop_frame; frame++) {
|
|
936
|
+
int start_sample_ref = SEARCHBUFFER * Downsample + frame * Nf / 2;
|
|
937
|
+
int start_sample_deg;
|
|
938
|
+
int delay;
|
|
939
|
+
|
|
940
|
+
short_term_fft (Nf, ref_info, Whanning, start_sample_ref, hz_spectrum_ref, fft_tmp);
|
|
941
|
+
|
|
942
|
+
if (err_info-> Nutterances < 1) {
|
|
943
|
+
printf ("Processing error!\n");
|
|
944
|
+
exit (1);
|
|
945
|
+
}
|
|
946
|
+
|
|
947
|
+
utt = err_info-> Nutterances - 1;
|
|
948
|
+
while ((utt >= 0) && (err_info-> Utt_Start [utt] * Downsample > start_sample_ref)) {
|
|
949
|
+
utt--;
|
|
950
|
+
}
|
|
951
|
+
if (utt >= 0) {
|
|
952
|
+
delay = err_info-> Utt_Delay [utt];
|
|
953
|
+
} else {
|
|
954
|
+
delay = err_info-> Utt_Delay [0];
|
|
955
|
+
}
|
|
956
|
+
start_sample_deg = start_sample_ref + delay;
|
|
957
|
+
|
|
958
|
+
if ((start_sample_deg > 0) && (start_sample_deg + Nf < maxNsamples + DATAPADDING_MSECS * (Fs / 1000))) {
|
|
959
|
+
short_term_fft (Nf, deg_info, Whanning, start_sample_deg, hz_spectrum_deg, fft_tmp);
|
|
960
|
+
} else {
|
|
961
|
+
for (i = 0; i < Nf / 2; i++) {
|
|
962
|
+
hz_spectrum_deg [i] = 0;
|
|
963
|
+
}
|
|
964
|
+
}
|
|
965
|
+
|
|
966
|
+
freq_warping (Nf / 2, hz_spectrum_ref, Nb, pitch_pow_dens_ref, frame);
|
|
967
|
+
|
|
968
|
+
peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
|
|
969
|
+
|
|
970
|
+
freq_warping (Nf / 2, hz_spectrum_deg, Nb, pitch_pow_dens_deg, frame);
|
|
971
|
+
|
|
972
|
+
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
|
|
973
|
+
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
|
|
974
|
+
|
|
975
|
+
silent [frame] = (total_audible_pow_ref < 1E7);
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
time_avg_audible_of (stop_frame + 1, silent, pitch_pow_dens_ref, avg_pitch_pow_dens_ref, (maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000)) / (Nf / 2) - 1);
|
|
979
|
+
time_avg_audible_of (stop_frame + 1, silent, pitch_pow_dens_deg, avg_pitch_pow_dens_deg, (maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000)) / (Nf / 2) - 1);
|
|
980
|
+
|
|
981
|
+
#ifndef CALIBRATE
|
|
982
|
+
freq_resp_compensation (stop_frame + 1, pitch_pow_dens_ref, avg_pitch_pow_dens_ref, avg_pitch_pow_dens_deg, 1000);
|
|
983
|
+
#endif
|
|
984
|
+
|
|
985
|
+
oldScale = 1;
|
|
986
|
+
for (frame = 0; frame <= stop_frame; frame++) {
|
|
987
|
+
int band;
|
|
988
|
+
|
|
989
|
+
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
|
|
990
|
+
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
|
|
991
|
+
total_power_ref [frame] = total_audible_pow_ref;
|
|
992
|
+
|
|
993
|
+
scale = (total_audible_pow_ref + (float) 5E3) / (total_audible_pow_deg + (float) 5E3);
|
|
994
|
+
|
|
995
|
+
if (frame > 0) {
|
|
996
|
+
scale = (float) 0.2 * oldScale + (float) 0.8*scale;
|
|
997
|
+
}
|
|
998
|
+
oldScale = scale;
|
|
999
|
+
|
|
1000
|
+
#define MAX_SCALE 5.0
|
|
1001
|
+
|
|
1002
|
+
if (scale > (float) MAX_SCALE) scale = (float) MAX_SCALE;
|
|
1003
|
+
|
|
1004
|
+
#define MIN_SCALE 3E-4
|
|
1005
|
+
|
|
1006
|
+
if (scale < (float) MIN_SCALE) {
|
|
1007
|
+
scale = (float) MIN_SCALE;
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
for (band = 0; band < Nb; band++) {
|
|
1011
|
+
pitch_pow_dens_deg [frame * Nb + band] *= scale;
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
intensity_warping_of (loudness_dens_ref, frame, pitch_pow_dens_ref);
|
|
1015
|
+
intensity_warping_of (loudness_dens_deg, frame, pitch_pow_dens_deg);
|
|
1016
|
+
|
|
1017
|
+
for (band = 0; band < Nb; band++) {
|
|
1018
|
+
disturbance_dens [band] = loudness_dens_deg [band] - loudness_dens_ref [band];
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
for (band = 0; band < Nb; band++) {
|
|
1022
|
+
deadzone [band] = min (loudness_dens_deg [band], loudness_dens_ref [band]);
|
|
1023
|
+
deadzone [band] *= 0.25;
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
for (band = 0; band < Nb; band++) {
|
|
1027
|
+
float d = disturbance_dens [band];
|
|
1028
|
+
float m = deadzone [band];
|
|
1029
|
+
|
|
1030
|
+
if (d > m) {
|
|
1031
|
+
disturbance_dens [band] -= m;
|
|
1032
|
+
} else {
|
|
1033
|
+
if (d < -m) {
|
|
1034
|
+
disturbance_dens [band] += m;
|
|
1035
|
+
} else {
|
|
1036
|
+
disturbance_dens [band] = 0;
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
|
|
1041
|
+
frame_disturbance [frame] = pseudo_Lp (Nb, disturbance_dens, D_POW_F);
|
|
1042
|
+
|
|
1043
|
+
#define THRESHOLD_BAD_FRAMES 30
|
|
1044
|
+
|
|
1045
|
+
if (frame_disturbance [frame] > THRESHOLD_BAD_FRAMES)
|
|
1046
|
+
{
|
|
1047
|
+
there_is_a_bad_frame = TRUE;
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
multiply_with_asymmetry_factor (disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
|
|
1051
|
+
|
|
1052
|
+
frame_disturbance_asym_add [frame] = pseudo_Lp (Nb, disturbance_dens, A_POW_F);
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1055
|
+
for (frame = 0; frame <= stop_frame; frame++) {
|
|
1056
|
+
frame_was_skipped [frame] = FALSE;
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
for (utt = 1; utt < err_info-> Nutterances; utt++) {
|
|
1060
|
+
int frame1 = (int) floor (((err_info-> Utt_Start [utt] - SEARCHBUFFER ) * Downsample + err_info-> Utt_Delay [utt]) / (Nf / 2));
|
|
1061
|
+
int j = (int) floor ((err_info-> Utt_End [utt-1] - SEARCHBUFFER) * Downsample + err_info-> Utt_Delay [utt-1]) / (Nf / 2);
|
|
1062
|
+
int delay_jump = err_info-> Utt_Delay [utt] - err_info-> Utt_Delay [utt-1];
|
|
1063
|
+
|
|
1064
|
+
if (frame1 > j) {
|
|
1065
|
+
frame1 = j;
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
if (frame1 < 0) {
|
|
1069
|
+
frame1 = 0;
|
|
1070
|
+
}
|
|
1071
|
+
|
|
1072
|
+
if (delay_jump < -(int) (Nf / 2)) {
|
|
1073
|
+
|
|
1074
|
+
int frame2 = (int) ((err_info-> Utt_Start [utt] - SEARCHBUFFER) * Downsample + max (0, fabs (delay_jump))) / (Nf / 2) + 1;
|
|
1075
|
+
|
|
1076
|
+
for (frame = frame1; frame <= frame2; frame++) {
|
|
1077
|
+
if (frame < stop_frame) {
|
|
1078
|
+
frame_was_skipped [frame] = TRUE;
|
|
1079
|
+
|
|
1080
|
+
frame_disturbance [frame] = 0;
|
|
1081
|
+
frame_disturbance_asym_add [frame] = 0;
|
|
1082
|
+
}
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
nn = DATAPADDING_MSECS * (Fs / 1000) + maxNsamples;
|
|
1088
|
+
|
|
1089
|
+
tweaked_deg = (float *) safe_malloc (nn * sizeof (float));
|
|
1090
|
+
|
|
1091
|
+
for (i = 0; i < nn; i++) {
|
|
1092
|
+
tweaked_deg [i] = 0;
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
for (i = SEARCHBUFFER * Downsample; i < nn - SEARCHBUFFER * Downsample; i++) {
|
|
1096
|
+
int utt = err_info-> Nutterances - 1;
|
|
1097
|
+
long delay, j;
|
|
1098
|
+
|
|
1099
|
+
while ((utt >= 0) && (err_info-> Utt_Start [utt] * Downsample > i)) {
|
|
1100
|
+
utt--;
|
|
1101
|
+
}
|
|
1102
|
+
if (utt >= 0) {
|
|
1103
|
+
delay = err_info-> Utt_Delay [utt];
|
|
1104
|
+
} else {
|
|
1105
|
+
delay = err_info-> Utt_Delay [0];
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
j = i + delay;
|
|
1109
|
+
if (j < SEARCHBUFFER * Downsample) {
|
|
1110
|
+
j = SEARCHBUFFER * Downsample;
|
|
1111
|
+
}
|
|
1112
|
+
if (j >= nn - SEARCHBUFFER * Downsample) {
|
|
1113
|
+
j = nn - SEARCHBUFFER * Downsample - 1;
|
|
1114
|
+
}
|
|
1115
|
+
tweaked_deg [i] = deg_info-> data [j];
|
|
1116
|
+
}
|
|
1117
|
+
|
|
1118
|
+
if (there_is_a_bad_frame) {
|
|
1119
|
+
|
|
1120
|
+
for (frame = 0; frame <= stop_frame; frame++)
|
|
1121
|
+
{
|
|
1122
|
+
frame_is_bad [frame] = (frame_disturbance [frame] > THRESHOLD_BAD_FRAMES);
|
|
1123
|
+
|
|
1124
|
+
smeared_frame_is_bad [frame] = FALSE;
|
|
1125
|
+
}
|
|
1126
|
+
frame_is_bad [0] = FALSE;
|
|
1127
|
+
|
|
1128
|
+
#define SMEAR_RANGE 2
|
|
1129
|
+
|
|
1130
|
+
for (frame = SMEAR_RANGE; frame < stop_frame - SMEAR_RANGE; frame++) {
|
|
1131
|
+
long max_itself_and_left = frame_is_bad [frame];
|
|
1132
|
+
long max_itself_and_right = frame_is_bad [frame];
|
|
1133
|
+
long mini, i;
|
|
1134
|
+
|
|
1135
|
+
for (i = -SMEAR_RANGE; i <= 0; i++) {
|
|
1136
|
+
if (max_itself_and_left < frame_is_bad [frame + i]) {
|
|
1137
|
+
max_itself_and_left = frame_is_bad [frame + i];
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
|
|
1141
|
+
for (i = 0; i <= SMEAR_RANGE; i++) {
|
|
1142
|
+
if (max_itself_and_right < frame_is_bad [frame + i]) {
|
|
1143
|
+
max_itself_and_right = frame_is_bad [frame + i];
|
|
1144
|
+
}
|
|
1145
|
+
}
|
|
1146
|
+
|
|
1147
|
+
mini = max_itself_and_left;
|
|
1148
|
+
if (mini > max_itself_and_right) {
|
|
1149
|
+
mini = max_itself_and_right;
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
smeared_frame_is_bad [frame] = mini;
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
#define MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL 5
|
|
1156
|
+
|
|
1157
|
+
number_of_bad_intervals = 0;
|
|
1158
|
+
frame = 0;
|
|
1159
|
+
while (frame <= stop_frame) {
|
|
1160
|
+
|
|
1161
|
+
while ((frame <= stop_frame) && (!smeared_frame_is_bad [frame])) {
|
|
1162
|
+
frame++;
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
if (frame <= stop_frame) {
|
|
1166
|
+
start_frame_of_bad_interval [number_of_bad_intervals] = frame;
|
|
1167
|
+
|
|
1168
|
+
while ((frame <= stop_frame) && (smeared_frame_is_bad [frame])) {
|
|
1169
|
+
frame++;
|
|
1170
|
+
}
|
|
1171
|
+
|
|
1172
|
+
if (frame <= stop_frame) {
|
|
1173
|
+
stop_frame_of_bad_interval [number_of_bad_intervals] = frame;
|
|
1174
|
+
|
|
1175
|
+
if (stop_frame_of_bad_interval [number_of_bad_intervals] - start_frame_of_bad_interval [number_of_bad_intervals] >= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL) {
|
|
1176
|
+
number_of_bad_intervals++;
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
for (bad_interval = 0; bad_interval < number_of_bad_intervals; bad_interval++) {
|
|
1183
|
+
start_sample_of_bad_interval [bad_interval] = start_frame_of_bad_interval [bad_interval] * (Nf / 2) + SEARCHBUFFER * Downsample;
|
|
1184
|
+
stop_sample_of_bad_interval [bad_interval] = stop_frame_of_bad_interval [bad_interval] * (Nf / 2) + Nf + SEARCHBUFFER* Downsample;
|
|
1185
|
+
if (stop_frame_of_bad_interval [bad_interval] > stop_frame) {
|
|
1186
|
+
stop_frame_of_bad_interval [bad_interval] = stop_frame;
|
|
1187
|
+
}
|
|
1188
|
+
|
|
1189
|
+
number_of_samples_in_bad_interval [bad_interval] = stop_sample_of_bad_interval [bad_interval] - start_sample_of_bad_interval [bad_interval];
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
#define SEARCH_RANGE_IN_TRANSFORM_LENGTH 4
|
|
1195
|
+
|
|
1196
|
+
search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
|
|
1197
|
+
|
|
1198
|
+
for (bad_interval= 0; bad_interval< number_of_bad_intervals; bad_interval++) {
|
|
1199
|
+
float *ref = (float *) safe_malloc ( (2 * search_range_in_samples + number_of_samples_in_bad_interval [bad_interval]) * sizeof (float));
|
|
1200
|
+
float *deg = (float *) safe_malloc ( (2 * search_range_in_samples + number_of_samples_in_bad_interval [bad_interval]) * sizeof (float));
|
|
1201
|
+
int i;
|
|
1202
|
+
float best_correlation;
|
|
1203
|
+
int delay_in_samples;
|
|
1204
|
+
|
|
1205
|
+
for (i = 0; i < search_range_in_samples; i++) {
|
|
1206
|
+
ref[i] = 0.0f;
|
|
1207
|
+
}
|
|
1208
|
+
for (i = 0; i < number_of_samples_in_bad_interval [bad_interval]; i++) {
|
|
1209
|
+
ref [search_range_in_samples + i] = ref_info-> data [start_sample_of_bad_interval [bad_interval] + i];
|
|
1210
|
+
}
|
|
1211
|
+
for (i = 0; i < search_range_in_samples; i++) {
|
|
1212
|
+
ref [search_range_in_samples + number_of_samples_in_bad_interval [bad_interval] + i] = 0.0f;
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
for (i = 0;
|
|
1216
|
+
i < 2 * search_range_in_samples + number_of_samples_in_bad_interval [bad_interval];
|
|
1217
|
+
i++) {
|
|
1218
|
+
|
|
1219
|
+
int j = start_sample_of_bad_interval [bad_interval] - search_range_in_samples + i;
|
|
1220
|
+
int nn = maxNsamples - SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000);
|
|
1221
|
+
|
|
1222
|
+
if (j < SEARCHBUFFER * Downsample) {
|
|
1223
|
+
j = SEARCHBUFFER * Downsample;
|
|
1224
|
+
}
|
|
1225
|
+
if (j >= nn) {
|
|
1226
|
+
j = nn - 1;
|
|
1227
|
+
}
|
|
1228
|
+
deg [i] = tweaked_deg [j];
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
delay_in_samples= compute_delay (0,
|
|
1232
|
+
2 * search_range_in_samples + number_of_samples_in_bad_interval [bad_interval],
|
|
1233
|
+
search_range_in_samples,
|
|
1234
|
+
ref,
|
|
1235
|
+
deg,
|
|
1236
|
+
&best_correlation);
|
|
1237
|
+
|
|
1238
|
+
delay_in_samples_in_bad_interval [bad_interval] = delay_in_samples;
|
|
1239
|
+
|
|
1240
|
+
if (best_correlation < 0.5) {
|
|
1241
|
+
delay_in_samples_in_bad_interval [bad_interval] = 0;
|
|
1242
|
+
}
|
|
1243
|
+
|
|
1244
|
+
safe_free (ref);
|
|
1245
|
+
safe_free (deg);
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
if (number_of_bad_intervals > 0) {
|
|
1249
|
+
doubly_tweaked_deg = (float *) safe_malloc ((maxNsamples + DATAPADDING_MSECS * (Fs / 1000)) * sizeof (float));
|
|
1250
|
+
|
|
1251
|
+
for (i = 0; i < maxNsamples + DATAPADDING_MSECS * (Fs / 1000); i++) {
|
|
1252
|
+
doubly_tweaked_deg [i] = tweaked_deg [i];
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
for (bad_interval= 0; bad_interval< number_of_bad_intervals; bad_interval++) {
|
|
1256
|
+
int delay = delay_in_samples_in_bad_interval [bad_interval];
|
|
1257
|
+
int i;
|
|
1258
|
+
|
|
1259
|
+
for (i = start_sample_of_bad_interval [bad_interval]; i < stop_sample_of_bad_interval [bad_interval]; i++) {
|
|
1260
|
+
float h;
|
|
1261
|
+
int j = i + delay;
|
|
1262
|
+
if (j < 0) {
|
|
1263
|
+
j = 0;
|
|
1264
|
+
}
|
|
1265
|
+
if (j >= maxNsamples) {
|
|
1266
|
+
j = maxNsamples - 1;
|
|
1267
|
+
|
|
1268
|
+
}
|
|
1269
|
+
doubly_tweaked_deg [i] = h = tweaked_deg [j];
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
1272
|
+
|
|
1273
|
+
untweaked_deg = deg_info-> data;
|
|
1274
|
+
deg_info-> data = doubly_tweaked_deg;
|
|
1275
|
+
|
|
1276
|
+
for (bad_interval= 0; bad_interval < number_of_bad_intervals; bad_interval++) {
|
|
1277
|
+
|
|
1278
|
+
for (frame = start_frame_of_bad_interval [bad_interval];
|
|
1279
|
+
frame < stop_frame_of_bad_interval [bad_interval];
|
|
1280
|
+
frame++) {
|
|
1281
|
+
|
|
1282
|
+
int start_sample_ref = SEARCHBUFFER * Downsample + frame * Nf / 2;
|
|
1283
|
+
int start_sample_deg = start_sample_ref;
|
|
1284
|
+
|
|
1285
|
+
short_term_fft (Nf, deg_info, Whanning, start_sample_deg, hz_spectrum_deg, fft_tmp);
|
|
1286
|
+
|
|
1287
|
+
freq_warping (Nf / 2, hz_spectrum_deg, Nb, pitch_pow_dens_deg, frame);
|
|
1288
|
+
}
|
|
1289
|
+
|
|
1290
|
+
oldScale = 1;
|
|
1291
|
+
for (frame = start_frame_of_bad_interval [bad_interval];
|
|
1292
|
+
frame < stop_frame_of_bad_interval [bad_interval];
|
|
1293
|
+
frame++) {
|
|
1294
|
+
int band;
|
|
1295
|
+
|
|
1296
|
+
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
|
|
1297
|
+
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
|
|
1298
|
+
|
|
1299
|
+
scale = (total_audible_pow_ref + (float) 5E3) / (total_audible_pow_deg + (float) 5E3);
|
|
1300
|
+
|
|
1301
|
+
if (frame > 0) {
|
|
1302
|
+
scale = (float) 0.2 * oldScale + (float) 0.8*scale;
|
|
1303
|
+
}
|
|
1304
|
+
oldScale = scale;
|
|
1305
|
+
|
|
1306
|
+
if (scale > (float) MAX_SCALE) scale = (float) MAX_SCALE;
|
|
1307
|
+
|
|
1308
|
+
if (scale < (float) MIN_SCALE) {
|
|
1309
|
+
scale = (float) MIN_SCALE;
|
|
1310
|
+
}
|
|
1311
|
+
|
|
1312
|
+
for (band = 0; band < Nb; band++) {
|
|
1313
|
+
pitch_pow_dens_deg [frame * Nb + band] *= scale;
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
intensity_warping_of (loudness_dens_ref, frame, pitch_pow_dens_ref);
|
|
1317
|
+
intensity_warping_of (loudness_dens_deg, frame, pitch_pow_dens_deg);
|
|
1318
|
+
|
|
1319
|
+
for (band = 0; band < Nb; band++) {
|
|
1320
|
+
disturbance_dens [band] = loudness_dens_deg [band] - loudness_dens_ref [band];
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
for (band = 0; band < Nb; band++) {
|
|
1324
|
+
deadzone [band] = min (loudness_dens_deg [band], loudness_dens_ref [band]);
|
|
1325
|
+
deadzone [band] *= 0.25;
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
for (band = 0; band < Nb; band++) {
|
|
1329
|
+
float d = disturbance_dens [band];
|
|
1330
|
+
float m = deadzone [band];
|
|
1331
|
+
|
|
1332
|
+
if (d > m) {
|
|
1333
|
+
disturbance_dens [band] -= m;
|
|
1334
|
+
} else {
|
|
1335
|
+
if (d < -m) {
|
|
1336
|
+
disturbance_dens [band] += m;
|
|
1337
|
+
} else {
|
|
1338
|
+
disturbance_dens [band] = 0;
|
|
1339
|
+
}
|
|
1340
|
+
}
|
|
1341
|
+
}
|
|
1342
|
+
|
|
1343
|
+
frame_disturbance [frame] = min (frame_disturbance [frame] , pseudo_Lp (Nb, disturbance_dens, D_POW_F));
|
|
1344
|
+
|
|
1345
|
+
multiply_with_asymmetry_factor (disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
|
|
1346
|
+
|
|
1347
|
+
frame_disturbance_asym_add [frame] = min (frame_disturbance_asym_add [frame], pseudo_Lp (Nb, disturbance_dens, A_POW_F));
|
|
1348
|
+
}
|
|
1349
|
+
}
|
|
1350
|
+
safe_free (doubly_tweaked_deg);
|
|
1351
|
+
deg_info->data = untweaked_deg;
|
|
1352
|
+
}
|
|
1353
|
+
}
|
|
1354
|
+
|
|
1355
|
+
|
|
1356
|
+
for (frame = 0; frame <= stop_frame; frame++) {
|
|
1357
|
+
float h = 1;
|
|
1358
|
+
|
|
1359
|
+
if (stop_frame + 1 > 1000) {
|
|
1360
|
+
long n = (maxNsamples - 2 * SEARCHBUFFER * Downsample) / (Nf / 2) - 1;
|
|
1361
|
+
double timeWeightFactor = (n - (float) 1000) / (float) 5500;
|
|
1362
|
+
if (timeWeightFactor > (float) 0.5) timeWeightFactor = (float) 0.5;
|
|
1363
|
+
h = (float) (((float) 1.0 - timeWeightFactor) + timeWeightFactor * (float) frame / (float) n);
|
|
1364
|
+
}
|
|
1365
|
+
|
|
1366
|
+
time_weight [frame] = h;
|
|
1367
|
+
}
|
|
1368
|
+
|
|
1369
|
+
for (frame = 0; frame <= stop_frame; frame++) {
|
|
1370
|
+
|
|
1371
|
+
float h = (float) pow ((total_power_ref [frame] + 1E5) / 1E7, 0.04);
|
|
1372
|
+
|
|
1373
|
+
frame_disturbance [frame] /= h;
|
|
1374
|
+
frame_disturbance_asym_add [frame] /= h;
|
|
1375
|
+
|
|
1376
|
+
if (frame_disturbance [frame] > 45) {
|
|
1377
|
+
frame_disturbance [frame] = 45;
|
|
1378
|
+
}
|
|
1379
|
+
if (frame_disturbance_asym_add [frame] > 45) {
|
|
1380
|
+
frame_disturbance_asym_add [frame] = 45;
|
|
1381
|
+
}
|
|
1382
|
+
}
|
|
1383
|
+
|
|
1384
|
+
d_indicator = Lpq_weight (start_frame, stop_frame, D_POW_S, D_POW_T, frame_disturbance, time_weight);
|
|
1385
|
+
a_indicator = Lpq_weight (start_frame, stop_frame, A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);
|
|
1386
|
+
|
|
1387
|
+
err_info-> pesq_mos = (float) (4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator);
|
|
1388
|
+
|
|
1389
|
+
FFTFree();
|
|
1390
|
+
safe_free (fft_tmp);
|
|
1391
|
+
safe_free (hz_spectrum_ref);
|
|
1392
|
+
safe_free (hz_spectrum_deg);
|
|
1393
|
+
safe_free (silent);
|
|
1394
|
+
safe_free (pitch_pow_dens_ref);
|
|
1395
|
+
safe_free (pitch_pow_dens_deg);
|
|
1396
|
+
safe_free (frame_was_skipped);
|
|
1397
|
+
safe_free (avg_pitch_pow_dens_ref);
|
|
1398
|
+
safe_free (avg_pitch_pow_dens_deg);
|
|
1399
|
+
safe_free (loudness_dens_ref);
|
|
1400
|
+
safe_free (loudness_dens_deg);
|
|
1401
|
+
safe_free (deadzone);
|
|
1402
|
+
safe_free (disturbance_dens);
|
|
1403
|
+
safe_free (disturbance_dens_asym_add);
|
|
1404
|
+
safe_free (total_power_ref);
|
|
1405
|
+
|
|
1406
|
+
safe_free (frame_is_bad);
|
|
1407
|
+
safe_free (smeared_frame_is_bad);
|
|
1408
|
+
|
|
1409
|
+
safe_free (time_weight);
|
|
1410
|
+
safe_free (frame_disturbance);
|
|
1411
|
+
safe_free (frame_disturbance_asym_add);
|
|
1412
|
+
safe_free (tweaked_deg);
|
|
1413
|
+
|
|
1414
|
+
return;
|
|
1415
|
+
}
|
|
1416
|
+
|
|
1417
|
+
/* END OF FILE */
|