neverlib 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. neverlib/.claude/settings.local.json +9 -0
  2. neverlib/Docs/audio_aug/test_volume.ipynb +416 -0
  3. neverlib/Docs/audio_aug_test/test_volume.ipynb +289 -0
  4. neverlib/Docs/filter/biquad.ipynb +129 -0
  5. neverlib/Docs/filter/filter_family.ipynb +450 -0
  6. neverlib/Docs/filter/highpass.ipynb +139 -0
  7. neverlib/Docs/filter/scipy_filter_family.ipynb +110 -0
  8. neverlib/Docs/vad/VAD_Energy.ipynb +167 -0
  9. neverlib/Docs/vad/VAD_Silero.ipynb +325 -0
  10. neverlib/Docs/vad/VAD_WebRTC.ipynb +189 -0
  11. neverlib/Docs/vad/VAD_funasr.ipynb +192 -0
  12. neverlib/Docs/vad/VAD_rvADfast.ipynb +162 -0
  13. neverlib/Docs/vad/VAD_statistics.ipynb +532 -0
  14. neverlib/Docs/vad/VAD_tenVAD.ipynb +292 -0
  15. neverlib/Docs/vad/VAD_vadlib.ipynb +168 -0
  16. neverlib/Docs/vad/VAD_whisper.ipynb +404 -0
  17. neverlib/QA/gen_init.py +218 -0
  18. neverlib/QA/get_fun.py +19 -0
  19. neverlib/__init__.py +40 -4
  20. neverlib/audio_aug/HarmonicDistortion.py +19 -13
  21. neverlib/audio_aug/__init__.py +82 -12
  22. neverlib/audio_aug/audio_aug.py +19 -14
  23. neverlib/audio_aug/clip_aug.py +15 -18
  24. neverlib/audio_aug/coder_aug.py +44 -24
  25. neverlib/audio_aug/coder_aug2.py +54 -37
  26. neverlib/audio_aug/loss_packet_aug.py +7 -7
  27. neverlib/audio_aug/quant_aug.py +19 -17
  28. neverlib/data/000_short_enhance.wav +0 -0
  29. neverlib/data/3956_speech.wav +0 -0
  30. neverlib/data/3956_sweep.wav +0 -0
  31. neverlib/data/vad_example.wav +0 -0
  32. neverlib/data/white.wav +0 -0
  33. neverlib/data/white_EQ.wav +0 -0
  34. neverlib/data/white_matched.wav +0 -0
  35. neverlib/data_analyze/__init__.py +69 -20
  36. neverlib/data_analyze/dataset_analyzer.py +109 -114
  37. neverlib/data_analyze/quality_metrics.py +87 -89
  38. neverlib/data_analyze/rms_distrubution.py +23 -42
  39. neverlib/data_analyze/spectral_analysis.py +43 -46
  40. neverlib/data_analyze/statistics.py +76 -76
  41. neverlib/data_analyze/temporal_features.py +15 -6
  42. neverlib/data_analyze/visualization.py +208 -144
  43. neverlib/filter/__init__.py +40 -20
  44. neverlib/filter/auto_eq/__init__.py +50 -31
  45. neverlib/filter/auto_eq/de_eq.py +0 -2
  46. neverlib/filter/common.py +24 -5
  47. neverlib/metrics/DNSMOS/bak_ovr.onnx +0 -0
  48. neverlib/metrics/DNSMOS/model_v8.onnx +0 -0
  49. neverlib/metrics/DNSMOS/sig.onnx +0 -0
  50. neverlib/metrics/DNSMOS/sig_bak_ovr.onnx +0 -0
  51. neverlib/metrics/__init__.py +59 -0
  52. neverlib/metrics/dnsmos.py +4 -15
  53. neverlib/metrics/pDNSMOS/sig_bak_ovr.onnx +0 -0
  54. neverlib/metrics/pesq_c/PESQ +0 -0
  55. neverlib/metrics/pesq_c/dsp.c +553 -0
  56. neverlib/metrics/pesq_c/dsp.h +138 -0
  57. neverlib/metrics/pesq_c/pesq.h +294 -0
  58. neverlib/metrics/pesq_c/pesqdsp.c +1047 -0
  59. neverlib/metrics/pesq_c/pesqio.c +392 -0
  60. neverlib/metrics/pesq_c/pesqmain.c +610 -0
  61. neverlib/metrics/pesq_c/pesqmod.c +1417 -0
  62. neverlib/metrics/pesq_c/pesqpar.h +297 -0
  63. neverlib/metrics/snr.py +5 -1
  64. neverlib/metrics/spec.py +31 -21
  65. neverlib/metrics/test_pesq.py +0 -4
  66. neverlib/tests/__init__.py +33 -1
  67. neverlib/tests/test_imports.py +19 -0
  68. neverlib/utils/__init__.py +71 -15
  69. neverlib/utils/audio_split.py +6 -1
  70. neverlib/utils/checkGPU.py +17 -9
  71. neverlib/utils/lazy_expose.py +29 -0
  72. neverlib/utils/utils.py +55 -12
  73. neverlib/vad/PreProcess.py +66 -66
  74. neverlib/vad/__init__.py +71 -25
  75. neverlib/vad/class_get_speech.py +1 -1
  76. neverlib/vad/class_vad.py +3 -3
  77. neverlib/vad/img.png +0 -0
  78. {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/METADATA +1 -1
  79. {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/RECORD +82 -39
  80. {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/WHEEL +0 -0
  81. {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/licenses/LICENSE +0 -0
  82. {neverlib-0.2.6.dist-info → neverlib-0.2.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1047 @@
1
+ /*****************************************************************************
2
+
3
+ Perceptual Evaluation of Speech Quality (PESQ)
4
+ ITU-T Recommendation P.862.
5
+ Version 1.2 - 2 August 2002.
6
+
7
+ ****************************************
8
+ PESQ Intellectual Property Rights Notice
9
+ ****************************************
10
+
11
+ DEFINITIONS:
12
+ ------------
13
+ For the purposes of this Intellectual Property Rights Notice
14
+ the terms "Perceptual Evaluation of Speech Quality Algorithm"
15
+ and "PESQ Algorithm" refer to the objective speech quality
16
+ measurement algorithm defined in ITU-T Recommendation P.862;
17
+ the term "PESQ Software" refers to the C-code component of P.862.
18
+
19
+ NOTICE:
20
+ -------
21
+ All copyright, trade marks, trade names, patents, know-how and
22
+ all or any other intellectual rights subsisting in or used in
23
+ connection with including all algorithms, documents and manuals
24
+ relating to the PESQ Algorithm and or PESQ Software are and remain
25
+ the sole property in law, ownership, regulations, treaties and
26
+ patent rights of the Owners identified below. The user may not
27
+ dispute or question the ownership of the PESQ Algorithm and
28
+ or PESQ Software.
29
+
30
+ OWNERS ARE:
31
+ -----------
32
+
33
+ 1. British Telecommunications plc (BT), all rights assigned
34
+ to Psytechnics Limited
35
+ 2. Royal KPN NV, all rights assigned to OPTICOM GmbH
36
+
37
+ RESTRICTIONS:
38
+ -------------
39
+
40
+ The user cannot:
41
+
42
+ 1. alter, duplicate, modify, adapt, or translate in whole or in
43
+ part any aspect of the PESQ Algorithm and or PESQ Software
44
+ 2. sell, hire, loan, distribute, dispose or put to any commercial
45
+ use other than those permitted below in whole or in part any
46
+ aspect of the PESQ Algorithm and or PESQ Software
47
+
48
+ PERMITTED USE:
49
+ --------------
50
+
51
+ The user may:
52
+
53
+ 1. Use the PESQ Software to:
54
+ i) understand the PESQ Algorithm; or
55
+ ii) evaluate the ability of the PESQ Algorithm to perform
56
+ its intended function of predicting the speech quality
57
+ of a system; or
58
+ iii) evaluate the computational complexity of the PESQ Algorithm,
59
+ with the limitation that none of said evaluations or its
60
+ results shall be used for external commercial use.
61
+
62
+ 2. Use the PESQ Software to test if an implementation of the PESQ
63
+ Algorithm conforms to ITU-T Recommendation P.862.
64
+
65
+ 3. With the prior written permission of both Psytechnics Limited
66
+ and OPTICOM GmbH, use the PESQ Software in accordance with the
67
+ above Restrictions to perform work that meets all of the following
68
+ criteria:
69
+ i) the work must contribute directly to the maintenance of an
70
+ existing ITU recommendation or the development of a new ITU
71
+ recommendation under an approved ITU Study Item; and
72
+ ii) the work and its results must be fully described in a
73
+ written contribution to the ITU that is presented at a formal
74
+ ITU meeting within one year of the start of the work; and
75
+ iii) neither the work nor its results shall be put to any
76
+ commercial use other than making said contribution to the ITU.
77
+ Said permission will be provided on a case-by-case basis.
78
+
79
+
80
+ ANY OTHER USE OR APPLICATION OF THE PESQ SOFTWARE AND/OR THE PESQ
81
+ ALGORITHM WILL REQUIRE A PESQ LICENCE AGREEMENT, WHICH MAY BE OBTAINED
82
+ FROM EITHER OPTICOM GMBH OR PSYTECHNICS LIMITED.
83
+
84
+ EACH COMPANY OFFERS OEM LICENSE AGREEMENTS, WHICH COMBINE OEM
85
+ IMPLEMENTATIONS OF THE PESQ ALGORITHM TOGETHER WITH A PESQ PATENT LICENSE
86
+ AGREEMENT. PESQ PATENT-ONLY LICENSE AGREEMENTS MAY BE OBTAINED FROM OPTICOM.
87
+
88
+
89
+ ***********************************************************************
90
+ * OPTICOM GmbH * Psytechnics Limited *
91
+ * Am Weichselgarten 7, * Fraser House, 23 Museum Street, *
92
+ * D- 91058 Erlangen, Germany * Ipswich IP1 1HN, England *
93
+ * Phone: +49 (0) 9131 691 160 * Phone: +44 (0) 1473 261 800 *
94
+ * Fax: +49 (0) 9131 691 325 * Fax: +44 (0) 1473 261 880 *
95
+ * E-mail: info@opticom.de, * E-mail: info@psytechnics.com, *
96
+ * www.opticom.de * www.psytechnics.com *
97
+ ***********************************************************************
98
+
99
+ Further information is also available from www.pesq.org
100
+
101
+ *****************************************************************************/
102
+
103
+ #include <math.h>
104
+ #include <stdio.h>
105
+ #include "pesq.h"
106
+ #include "dsp.h"
107
+
108
+ void DC_block( float * data, long Nsamples )
109
+ {
110
+ float *p;
111
+ long count;
112
+ float facc = 0.0f;
113
+
114
+ long ofs = SEARCHBUFFER * Downsample;
115
+
116
+ p = data + ofs;
117
+ for( count = (Nsamples - 2 * ofs); count > 0L; count-- )
118
+ facc += *(p++);
119
+ facc /= Nsamples;
120
+
121
+ p = data + ofs;
122
+ for( count = (Nsamples - 2 * ofs); count > 0L; count-- )
123
+ *(p++) -= facc;
124
+
125
+ p = data + ofs;
126
+ for( count = 0L; count < Downsample; count++ )
127
+ *(p++) *= (0.5f + count) / Downsample;
128
+
129
+ p = data + Nsamples - ofs - 1L;
130
+ for( count = 0L; count < Downsample; count++ )
131
+ *(p--) *= (0.5f + count) / Downsample;
132
+ }
133
+
134
+ long InIIR_Nsos;
135
+ float *InIIR_Hsos;
136
+
137
+ void apply_filters( float * data, long Nsamples )
138
+ {
139
+ IIRFilt( InIIR_Hsos, InIIR_Nsos, NULL,
140
+ data, Nsamples + DATAPADDING_MSECS * (Fs / 1000), NULL );
141
+ }
142
+
143
/*
 * Piecewise-linear lookup in a (frequency, gain) table.
 *
 * freq              frequency at which the curve is evaluated
 * filter_curve_db   table rows {frequency, value}; frequencies must be in
 *                   strictly increasing order
 * number_of_points  number of rows in the table
 *
 * Returns the value on the straight line through the two rows that bracket
 * freq.  Below the first row the first segment is extended, above the last
 * row the last segment is extended (linear extrapolation, no clamping).
 *
 * A table with a single row yields that row's value for every freq, and an
 * empty table yields 0; previously both cases read past the end of the table.
 */
float interpolate (float freq,
                   double filter_curve_db [][2],
                   int number_of_points) {
    double result;
    int    hi;
    double freqLow, freqHigh;
    double curveLow, curveHigh;

    if (number_of_points < 1) {
        return 0.0f;
    }
    if (number_of_points == 1) {
        return (float) filter_curve_db [0][1];
    }

    /* Choose the upper row of the segment to interpolate on. */
    if (freq <= filter_curve_db [0][0]) {
        hi = 1;
    } else if (freq >= filter_curve_db [number_of_points-1][0]) {
        hi = number_of_points - 1;
    } else {
        /* Terminates before the last row: freq is below the last frequency. */
        hi = 1;
        while (filter_curve_db [hi][0] < freq) {
            hi++;
        }
    }

    freqLow   = filter_curve_db [hi-1][0];
    curveLow  = filter_curve_db [hi-1][1];
    freqHigh  = filter_curve_db [hi][0];
    curveHigh = filter_curve_db [hi][1];

    result = ((freq - freqLow) * curveHigh + (freqHigh - freq) * curveLow)/ (freqHigh - freqLow);

    return (float) result;
}
188
+
189
+
190
+ void apply_filter ( float * data, long maxNsamples, int number_of_points, double filter_curve_db [][2] )
191
+ {
192
+ long n = maxNsamples - 2 * SEARCHBUFFER * Downsample + DATAPADDING_MSECS * (Fs / 1000);
193
+ long pow_of_2 = nextpow2 (n);
194
+ float *x = (float *) safe_malloc ((pow_of_2 + 2) * sizeof (float));
195
+
196
+ float factorDb, factor;
197
+
198
+ float overallGainFilter = interpolate ((float) 1000, filter_curve_db, number_of_points);
199
+ float freq_resolution;
200
+ int i;
201
+
202
+ for (i = 0; i < pow_of_2 + 2; i++) {
203
+ x [i] = 0;
204
+ }
205
+
206
+ for (i = 0; i < n; i++) {
207
+ x [i] = data [i + SEARCHBUFFER * Downsample];
208
+ }
209
+
210
+ RealFFT (x, pow_of_2);
211
+
212
+ freq_resolution = (float) Fs / (float) pow_of_2;
213
+
214
+
215
+ for (i = 0; i <= pow_of_2/2; i++) {
216
+ factorDb = interpolate (i * freq_resolution, filter_curve_db, number_of_points) - overallGainFilter;
217
+ factor = (float) pow ((float) 10, factorDb / (float) 20);
218
+
219
+ x [2 * i] *= factor;
220
+ x [2 * i + 1] *= factor;
221
+ }
222
+
223
+ RealIFFT (x, pow_of_2);
224
+
225
+ for (i = 0; i < n; i++) {
226
+ data [i + SEARCHBUFFER * Downsample] = x[i];
227
+ }
228
+
229
+ safe_free (x);
230
+ }
231
+
232
+ void apply_VAD( SIGNAL_INFO * pinfo, float * data, float * VAD, float * logVAD )
233
+ {
234
+ float g;
235
+ float LevelThresh;
236
+ float LevelNoise;
237
+ float StDNoise;
238
+ float LevelSig;
239
+ float LevelMin;
240
+ long count;
241
+ long iteration;
242
+ long length;
243
+ long start;
244
+ long finish;
245
+ long Nwindows = (*pinfo).Nsamples / Downsample;
246
+
247
+ for( count = 0L; count < Nwindows; count++ )
248
+ {
249
+ VAD[count] = 0.0f;
250
+ for( iteration = 0L; iteration < Downsample; iteration++ )
251
+ {
252
+ g = data[count * Downsample + iteration];
253
+ VAD[count] += (g * g);
254
+ }
255
+ VAD[count] /= Downsample;
256
+ }
257
+
258
+ LevelThresh = 0.0f;
259
+ for( count = 0L; count < Nwindows; count++ )
260
+ LevelThresh += VAD[count];
261
+ LevelThresh /= Nwindows;
262
+
263
+ LevelMin = 0.0f;
264
+ for( count = 0L; count < Nwindows; count++ )
265
+ if( VAD[count] > LevelMin )
266
+ LevelMin = VAD[count];
267
+ if( LevelMin > 0.0f )
268
+ LevelMin *= 1.0e-4f;
269
+ else
270
+ LevelMin = 1.0f;
271
+
272
+ for( count = 0L; count < Nwindows; count++ )
273
+ if( VAD[count] < LevelMin )
274
+ VAD[count] = LevelMin;
275
+
276
+ for( iteration = 0L; iteration < 12L; iteration++ )
277
+ {
278
+ LevelNoise = 0.0f;
279
+ StDNoise = 0.0f;
280
+ length = 0L;
281
+ for( count = 0L; count < Nwindows; count++ )
282
+ if( VAD[count] <= LevelThresh )
283
+ {
284
+ LevelNoise += VAD[count];
285
+ length++;
286
+ }
287
+ if( length > 0L )
288
+ {
289
+ LevelNoise /= length;
290
+ for( count = 0L; count < Nwindows; count++ )
291
+ if( VAD[count] <= LevelThresh )
292
+ {
293
+ g = VAD[count] - LevelNoise;
294
+ StDNoise += g * g;
295
+ }
296
+ StDNoise = (float)sqrt(StDNoise / length);
297
+ }
298
+
299
+ LevelThresh = 1.001f * (LevelNoise + 2.0f * StDNoise);
300
+ }
301
+
302
+ LevelNoise = 0.0f;
303
+ LevelSig = 0.0f;
304
+ length = 0L;
305
+ for( count = 0L; count < Nwindows; count++ )
306
+ {
307
+ if( VAD[count] > LevelThresh )
308
+ {
309
+ LevelSig += VAD[count];
310
+ length++;
311
+ }
312
+ else
313
+ LevelNoise += VAD[count];
314
+ }
315
+ if( length > 0L )
316
+ LevelSig /= length;
317
+ else
318
+ LevelThresh = -1.0f;
319
+ if( length < Nwindows )
320
+ LevelNoise /= (Nwindows - length);
321
+ else
322
+ LevelNoise = 1.0f;
323
+
324
+ for( count = 0L; count < Nwindows; count++ )
325
+ if( VAD[count] <= LevelThresh )
326
+ VAD[count] = -VAD[count];
327
+
328
+ VAD[0] = -LevelMin;
329
+ VAD[Nwindows-1] = -LevelMin;
330
+
331
+ start = 0L;
332
+ finish = 0L;
333
+ for( count = 1; count < Nwindows; count++ )
334
+ {
335
+ if( (VAD[count] > 0.0f) && (VAD[count-1] <= 0.0f) )
336
+ start = count;
337
+ if( (VAD[count] <= 0.0f) && (VAD[count-1] > 0.0f) )
338
+ {
339
+ finish = count;
340
+ if( (finish - start) <= MINSPEECHLGTH )
341
+ for( iteration = start; iteration < finish; iteration++ )
342
+ VAD[iteration] = -VAD[iteration];
343
+ }
344
+ }
345
+
346
+ if( LevelSig >= (LevelNoise * 1000.0f) )
347
+ {
348
+ for( count = 1; count < Nwindows; count++ )
349
+ {
350
+ if( (VAD[count] > 0.0f) && (VAD[count-1] <= 0.0f) )
351
+ start = count;
352
+ if( (VAD[count] <= 0.0f) && (VAD[count-1] > 0.0f) )
353
+ {
354
+ finish = count;
355
+ g = 0.0f;
356
+ for( iteration = start; iteration < finish; iteration++ )
357
+ g += VAD[iteration];
358
+ if( g < 3.0f * LevelThresh * (finish - start) )
359
+ for( iteration = start; iteration < finish; iteration++ )
360
+ VAD[iteration] = -VAD[iteration];
361
+ }
362
+ }
363
+ }
364
+
365
+ start = 0L;
366
+ finish = 0L;
367
+ for( count = 1; count < Nwindows; count++ )
368
+ {
369
+ if( (VAD[count] > 0.0f) && (VAD[count-1] <= 0.0f) )
370
+ {
371
+ start = count;
372
+ if( (finish > 0L) && ((start - finish) <= JOINSPEECHLGTH) )
373
+ for( iteration = finish; iteration < start; iteration++ )
374
+ VAD[iteration] = LevelMin;
375
+ }
376
+ if( (VAD[count] <= 0.0f) && (VAD[count-1] > 0.0f) )
377
+ finish = count;
378
+ }
379
+
380
+ start = 0L;
381
+ for( count = 1; count < Nwindows; count++ )
382
+ {
383
+ if( (VAD[count] > 0.0f) && (VAD[count-1] <= 0.0f) )
384
+ start = count;
385
+ }
386
+ if( start == 0L )
387
+ {
388
+ for( count = 0L; count < Nwindows; count++ )
389
+ VAD[count] = (float)fabs(VAD[count]);
390
+ VAD[0] = -LevelMin;
391
+ VAD[Nwindows-1] = -LevelMin;
392
+ }
393
+
394
+ count = 3;
395
+ while( count < (Nwindows-2) )
396
+ {
397
+ if( (VAD[count] > 0.0f) && (VAD[count-2] <= 0.0f) )
398
+ {
399
+ VAD[count-2] = VAD[count] * 0.1f;
400
+ VAD[count-1] = VAD[count] * 0.3f;
401
+ count++;
402
+ }
403
+ if( (VAD[count] <= 0.0f) && (VAD[count-1] > 0.0f) )
404
+ {
405
+ VAD[count] = VAD[count-1] * 0.3f;
406
+ VAD[count+1] = VAD[count-1] * 0.1f;
407
+ count += 3;
408
+ }
409
+ count++;
410
+ }
411
+
412
+ for( count = 0L; count < Nwindows; count++ )
413
+ if( VAD[count] < 0.0f ) VAD[count] = 0.0f;
414
+
415
+ if( LevelThresh <= 0.0f )
416
+ LevelThresh = LevelMin;
417
+ for( count = 0L; count < Nwindows; count++ )
418
+ {
419
+ if( VAD[count] <= LevelThresh )
420
+ logVAD[count] = 0.0f;
421
+ else
422
+ logVAD[count] = (float)log( VAD[count]/LevelThresh );
423
+ }
424
+ }
425
+
426
+ void crude_align(
427
+ SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info, ERROR_INFO * err_info,
428
+ long Utt_id, float * ftmp)
429
+ {
430
+ long nr;
431
+ long nd;
432
+ long startr;
433
+ long startd;
434
+ long count;
435
+ long I_max;
436
+ float max;
437
+ float * ref_VAD = (*ref_info).logVAD;
438
+ float * deg_VAD = (*deg_info).logVAD;
439
+ float * Y;
440
+
441
+ if( Utt_id == WHOLE_SIGNAL )
442
+ {
443
+ nr = (*ref_info).Nsamples / Downsample;
444
+ nd = (*deg_info).Nsamples / Downsample;
445
+ startr = 0L;
446
+ startd = 0L;
447
+ }
448
+ else if( Utt_id == MAXNUTTERANCES )
449
+ {
450
+ startr = (*err_info).UttSearch_Start[MAXNUTTERANCES-1];
451
+ startd = startr + (*err_info).Utt_DelayEst[MAXNUTTERANCES-1] / Downsample;
452
+
453
+ if ( startd < 0L )
454
+ {
455
+ startr = -(*err_info).Utt_DelayEst[MAXNUTTERANCES-1] / Downsample;
456
+ startd = 0L;
457
+ }
458
+
459
+ nr = (*err_info).UttSearch_End[MAXNUTTERANCES-1] - startr;
460
+ nd = nr;
461
+
462
+ if( startd + nd > (*deg_info).Nsamples / Downsample )
463
+ nd = (*deg_info).Nsamples / Downsample - startd;
464
+ }
465
+ else
466
+ {
467
+ startr = (*err_info).UttSearch_Start[Utt_id];
468
+ startd = startr + (*err_info).Crude_DelayEst / Downsample;
469
+
470
+ if ( startd < 0L )
471
+ {
472
+ startr = -(*err_info).Crude_DelayEst / Downsample;
473
+ startd = 0L;
474
+ }
475
+
476
+ nr = (*err_info).UttSearch_End[Utt_id] - startr;
477
+ nd = nr;
478
+
479
+ if( startd + nd > (*deg_info).Nsamples / Downsample )
480
+ nd = (*deg_info).Nsamples / Downsample - startd;
481
+ }
482
+
483
+ Y = ftmp;
484
+
485
+ if( (nr > 1L) && (nd > 1L) )
486
+ FFTNXCorr( ref_VAD + startr, nr, deg_VAD + startd, nd, Y );
487
+
488
+ max = 0.0f;
489
+ I_max = nr - 1;
490
+ if( (nr > 1L) && (nd > 1L) )
491
+ for( count = 0L; count < (nr+nd-1); count++ )
492
+ if( Y[count] > max )
493
+ {
494
+ max = Y[count];
495
+ I_max = count;
496
+ }
497
+
498
+ if( Utt_id == WHOLE_SIGNAL )
499
+ {
500
+ (*err_info).Crude_DelayEst = (I_max - nr + 1) * Downsample;
501
+ (*err_info).Crude_DelayConf = 0.0f;
502
+ }
503
+ else if( Utt_id == MAXNUTTERANCES )
504
+ {
505
+ (*err_info).Utt_Delay[MAXNUTTERANCES-1] =
506
+ (I_max - nr + 1) * Downsample + (*err_info).Utt_DelayEst[MAXNUTTERANCES-1];
507
+ }
508
+ else
509
+ {
510
+ (*err_info).Utt_DelayEst[Utt_id] =
511
+ (I_max - nr + 1) * Downsample + (*err_info).Crude_DelayEst;
512
+ }
513
+
514
+ FFTFree();
515
+ }
516
+
517
+ void time_align(
518
+ SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info, ERROR_INFO * err_info,
519
+ long Utt_id, float * ftmp )
520
+ {
521
+ long count;
522
+ long I_max;
523
+ float v_max;
524
+ long estdelay;
525
+ long startr;
526
+ long startd;
527
+ float * X1;
528
+ float * X2;
529
+ float * H;
530
+ float * Window;
531
+ float r1, i1;
532
+ long kernel;
533
+ float Hsum;
534
+
535
+ estdelay = (*err_info).Utt_DelayEst[Utt_id];
536
+
537
+ X1 = ftmp;
538
+ X2 = ftmp + Align_Nfft + 2;
539
+ H = (ftmp + 4 + 2 * Align_Nfft);
540
+ for( count = 0L; count < Align_Nfft; count++ )
541
+ H[count] = 0.0f;
542
+ Window = ftmp + 5 * Align_Nfft;
543
+
544
+ for( count = 0L; count < Align_Nfft; count++ )
545
+ Window[count] = (float)(0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft)));
546
+
547
+ startr = (*err_info).UttSearch_Start[Utt_id] * Downsample;
548
+ startd = startr + estdelay;
549
+
550
+ if ( startd < 0L )
551
+ {
552
+ startr = -estdelay;
553
+ startd = 0L;
554
+ }
555
+
556
+ while( ((startd + Align_Nfft) <= (*deg_info).Nsamples) &&
557
+ ((startr + Align_Nfft) <= ((*err_info).UttSearch_End[Utt_id] * Downsample)) )
558
+ {
559
+ for( count = 0L; count < Align_Nfft; count++ )
560
+ {
561
+ X1[count] = (*ref_info).data[count + startr] * Window[count];
562
+ X2[count] = (*deg_info).data[count + startd] * Window[count];
563
+
564
+ }
565
+ RealFFT( X1, Align_Nfft );
566
+ RealFFT( X2, Align_Nfft );
567
+
568
+ for( count = 0L; count <= Align_Nfft / 2; count++ )
569
+ {
570
+ r1 = X1[count * 2]; i1 = -X1[1 + (count * 2)];
571
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
572
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
573
+ }
574
+
575
+ RealIFFT( X1, Align_Nfft );
576
+
577
+ v_max = 0.0f;
578
+ for( count = 0L; count < Align_Nfft; count++ )
579
+ {
580
+ r1 = (float) fabs(X1[count]);
581
+ X1[count] = r1;
582
+ if( r1 > v_max ) v_max = r1;
583
+ }
584
+ v_max *= 0.99f;
585
+ for( count = 0L; count < Align_Nfft; count++ )
586
+ if( X1[count] > v_max )
587
+ H[count] += (float) pow( v_max, 0.125 );
588
+
589
+ startr += (Align_Nfft / 4);
590
+ startd += (Align_Nfft / 4);
591
+ }
592
+
593
+ Hsum = 0.0f;
594
+ for( count = 0L; count < Align_Nfft; count++ )
595
+ {
596
+ Hsum += H[count];
597
+ X1[count] = H[count];
598
+ X2[count] = 0.0f;
599
+ }
600
+
601
+ X2[0] = 1.0f;
602
+ kernel = Align_Nfft / 64;
603
+ for( count = 1; count < kernel; count++ )
604
+ {
605
+ X2[count] = 1.0f - ((float)count) / ((float)kernel);
606
+ X2[(Align_Nfft - count)] = 1.0f - ((float)count) / ((float)kernel);
607
+ }
608
+ RealFFT( X1, Align_Nfft );
609
+ RealFFT( X2, Align_Nfft );
610
+
611
+ for( count = 0L; count <= Align_Nfft / 2; count++ )
612
+ {
613
+ r1 = X1[count * 2]; i1 = X1[1 + (count * 2)];
614
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
615
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
616
+ }
617
+ RealIFFT( X1, Align_Nfft );
618
+
619
+ for( count = 0L; count < Align_Nfft; count++ )
620
+ {
621
+ if( Hsum > 0.0 )
622
+ H[count] = (float) fabs(X1[count]) / Hsum;
623
+ else
624
+ H[count] = 0.0f;
625
+ }
626
+
627
+ v_max = 0.0f;
628
+ I_max = 0L;
629
+ for( count = 0L; count < Align_Nfft; count++ )
630
+ if( H[count] > v_max )
631
+ {
632
+ v_max = H[count];
633
+ I_max = count;
634
+ }
635
+ if( I_max >= (Align_Nfft/2) )
636
+ I_max -= Align_Nfft;
637
+
638
+ (*err_info).Utt_Delay[Utt_id] = estdelay + I_max;
639
+ (*err_info).Utt_DelayConf[Utt_id] = v_max;
640
+
641
+ FFTFree();
642
+ }
643
+
644
+ void split_align( SIGNAL_INFO * ref_info, SIGNAL_INFO * deg_info,
645
+ ERROR_INFO * err_info, float * ftmp,
646
+ long Utt_Start, long Utt_SpeechStart, long Utt_SpeechEnd, long Utt_End,
647
+ long Utt_DelayEst, float Utt_DelayConf,
648
+ long * Best_ED1, long * Best_D1, float * Best_DC1,
649
+ long * Best_ED2, long * Best_D2, float * Best_DC2,
650
+ long * Best_BP )
651
+ {
652
+ long count, bp, k;
653
+ long Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
654
+ long Utt_Test = MAXNUTTERANCES - 1;
655
+
656
+ long N_BPs;
657
+ long Utt_BPs[41];
658
+ long Utt_ED1[41], Utt_ED2[41];
659
+ long Utt_D1[41], Utt_D2[41];
660
+ float Utt_DC1[41], Utt_DC2[41];
661
+
662
+ long Delta, Step, Pad;
663
+
664
+ long estdelay;
665
+ long I_max;
666
+ float v_max, n_max;
667
+ long startr;
668
+ long startd;
669
+ float * X1;
670
+ float * X2;
671
+ float * H;
672
+ float * Window;
673
+ float r1, i1;
674
+ long kernel;
675
+ float Hsum;
676
+
677
+ *Best_DC1 = 0.0f;
678
+ *Best_DC2 = 0.0f;
679
+
680
+ X1 = ftmp;
681
+ X2 = ftmp + 2 + Align_Nfft;
682
+ H = (ftmp + 4 + 2 * Align_Nfft);
683
+ Window = ftmp + 6 + 3 * Align_Nfft;
684
+ for( count = 0L; count < Align_Nfft; count++ )
685
+ Window[count] = (float)(0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft)));
686
+ kernel = Align_Nfft / 64;
687
+
688
+ Delta = Align_Nfft / (4 * Downsample);
689
+
690
+ Step = (long) ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta));
691
+ Step *= Delta;
692
+
693
+ Pad = Utt_Len / 10;
694
+ if( Pad < 75 ) Pad = 75;
695
+ Utt_BPs[0] = Utt_SpeechStart + Pad;
696
+ N_BPs = 0;
697
+ do {
698
+ N_BPs++;
699
+ Utt_BPs[N_BPs] = Utt_BPs[N_BPs-1] + Step;
700
+ } while( (Utt_BPs[N_BPs] <= (Utt_SpeechEnd - Pad)) && (N_BPs < 40) );
701
+
702
+ if( N_BPs <= 0 ) return;
703
+
704
+ for( bp = 0; bp < N_BPs; bp++ )
705
+ {
706
+ (*err_info).Utt_DelayEst[Utt_Test] = Utt_DelayEst;
707
+ (*err_info).UttSearch_Start[Utt_Test] = Utt_Start;
708
+ (*err_info).UttSearch_End[Utt_Test] = Utt_BPs[bp];
709
+
710
+ crude_align( ref_info, deg_info, err_info, MAXNUTTERANCES, ftmp);
711
+ Utt_ED1[bp] = (*err_info).Utt_Delay[Utt_Test];
712
+
713
+ (*err_info).Utt_DelayEst[Utt_Test] = Utt_DelayEst;
714
+ (*err_info).UttSearch_Start[Utt_Test] = Utt_BPs[bp];
715
+ (*err_info).UttSearch_End[Utt_Test] = Utt_End;
716
+
717
+ crude_align( ref_info, deg_info, err_info, MAXNUTTERANCES, ftmp);
718
+ Utt_ED2[bp] = (*err_info).Utt_Delay[Utt_Test];
719
+ }
720
+
721
+ for( bp = 0; bp < N_BPs; bp++ )
722
+ Utt_DC1[bp] = -2.0f;
723
+ while( 1 )
724
+ {
725
+ bp = 0;
726
+ while( (bp < N_BPs) && (Utt_DC1[bp] > -2.0) )
727
+ bp++;
728
+ if( bp >= N_BPs )
729
+ break;
730
+
731
+ estdelay = Utt_ED1[bp];
732
+
733
+ for( count = 0L; count < Align_Nfft; count++ )
734
+ H[count] = 0.0f;
735
+ Hsum = 0.0f;
736
+
737
+ startr = Utt_Start * Downsample;
738
+ startd = startr + estdelay;
739
+
740
+ if ( startd < 0L )
741
+ {
742
+ startr = -estdelay;
743
+ startd = 0L;
744
+ }
745
+
746
+ while( ((startd + Align_Nfft) <= (*deg_info).Nsamples) &&
747
+ ((startr + Align_Nfft) <= (Utt_BPs[bp] * Downsample)) )
748
+ {
749
+ for( count = 0L; count < Align_Nfft; count++ )
750
+ {
751
+ X1[count] = (*ref_info).data[count + startr] * Window[count];
752
+ X2[count] = (*deg_info).data[count + startd] * Window[count];
753
+ }
754
+ RealFFT( X1, Align_Nfft );
755
+ RealFFT( X2, Align_Nfft );
756
+
757
+ for( count = 0L; count <= Align_Nfft / 2; count++ )
758
+ {
759
+ r1 = X1[count * 2]; i1 = -X1[1 + (count * 2)];
760
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
761
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
762
+ }
763
+
764
+ RealIFFT( X1, Align_Nfft );
765
+
766
+ v_max = 0.0f;
767
+ for( count = 0L; count < Align_Nfft; count++ )
768
+ {
769
+ r1 = (float) fabs(X1[count]);
770
+ X1[count] = r1;
771
+ if( r1 > v_max ) v_max = r1;
772
+ }
773
+ v_max *= 0.99f;
774
+ n_max = (float) pow( v_max, 0.125 ) / kernel;
775
+
776
+ for( count = 0L; count < Align_Nfft; count++ )
777
+ if( X1[count] > v_max )
778
+ {
779
+ Hsum += n_max * kernel;
780
+ for( k = 1-kernel; k < kernel; k++ )
781
+ H[(count + k + Align_Nfft) % Align_Nfft] +=
782
+ n_max * (kernel - (float) fabs(k));
783
+ }
784
+
785
+ startr += (Align_Nfft / 4);
786
+ startd += (Align_Nfft / 4);
787
+ }
788
+
789
+ v_max = 0.0f;
790
+ I_max = 0L;
791
+ for( count = 0L; count < Align_Nfft; count++ )
792
+ if( H[count] > v_max )
793
+ {
794
+ v_max = H[count];
795
+ I_max = count;
796
+ }
797
+ if( I_max >= (Align_Nfft/2) )
798
+ I_max -= Align_Nfft;
799
+
800
+ Utt_D1[bp] = estdelay + I_max;
801
+ if( Hsum > 0.0 )
802
+ Utt_DC1[bp] = v_max / Hsum;
803
+ else
804
+ Utt_DC1[bp] = 0.0f;
805
+
806
+ while( bp < (N_BPs - 1) )
807
+ {
808
+ bp++;
809
+ if( (Utt_ED1[bp] == estdelay) && (Utt_DC1[bp] <= -2.0) )
810
+ {
811
+ while( ((startd + Align_Nfft) <= (*deg_info).Nsamples) &&
812
+ ((startr + Align_Nfft) <= (Utt_BPs[bp] * Downsample)) )
813
+ {
814
+ for( count = 0L; count < Align_Nfft; count++ )
815
+ {
816
+ X1[count] = (*ref_info).data[count + startr] * Window[count];
817
+ X2[count] = (*deg_info).data[count + startd] * Window[count];
818
+ }
819
+ RealFFT( X1, Align_Nfft );
820
+ RealFFT( X2, Align_Nfft );
821
+
822
+ for( count = 0L; count <= Align_Nfft/2; count++ )
823
+ {
824
+ r1 = X1[count * 2]; i1 = -X1[1 + (count * 2)];
825
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
826
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
827
+ }
828
+
829
+ RealIFFT( X1, Align_Nfft );
830
+
831
+ v_max = 0.0f;
832
+ for( count = 0L; count < Align_Nfft; count++ )
833
+ {
834
+ r1 = (float) fabs(X1[count]);
835
+ X1[count] = r1;
836
+ if( r1 > v_max ) v_max = r1;
837
+ }
838
+ v_max *= 0.99f;
839
+ n_max = (float) pow( v_max, 0.125 ) / kernel;
840
+
841
+ for( count = 0L; count < Align_Nfft; count++ )
842
+ if( X1[count] > v_max )
843
+ {
844
+ Hsum += n_max * kernel;
845
+ for( k = 1-kernel; k < kernel; k++ )
846
+ H[(count + k + Align_Nfft) % Align_Nfft] +=
847
+ n_max * (kernel - (float) fabs(k));
848
+ }
849
+
850
+ startr += (Align_Nfft / 4);
851
+ startd += (Align_Nfft / 4);
852
+ }
853
+
854
+ v_max = 0.0f;
855
+ I_max = 0L;
856
+ for( count = 0L; count < Align_Nfft; count++ )
857
+ if( H[count] > v_max )
858
+ {
859
+ v_max = H[count];
860
+ I_max = count;
861
+ }
862
+ if( I_max >= (Align_Nfft/2) )
863
+ I_max -= Align_Nfft;
864
+
865
+ Utt_D1[bp] = estdelay + I_max;
866
+ if( Hsum > 0.0 )
867
+ Utt_DC1[bp] = v_max / Hsum;
868
+ else
869
+ Utt_DC1[bp] = 0.0f;
870
+ }
871
+ }
872
+ }
873
+
874
+ for( bp = 0; bp < N_BPs; bp++ )
875
+ {
876
+ if( Utt_DC1[bp] > Utt_DelayConf )
877
+ Utt_DC2[bp] = -2.0f;
878
+ else
879
+ Utt_DC2[bp] = 0.0f;
880
+ }
881
+ while( 1 )
882
+ {
883
+ bp = N_BPs - 1;
884
+ while( (bp >= 0) && (Utt_DC2[bp] > -2.0) )
885
+ bp--;
886
+ if( bp < 0 )
887
+ break;
888
+
889
+ estdelay = Utt_ED2[bp];
890
+
891
+ for( count = 0L; count < Align_Nfft; count++ )
892
+ H[count] = 0.0f;
893
+ Hsum = 0.0f;
894
+
895
+ startr = Utt_End * Downsample - Align_Nfft;
896
+ startd = startr + estdelay;
897
+
898
+ if ( (startd + Align_Nfft) > (*deg_info).Nsamples )
899
+ {
900
+ startd = (*deg_info).Nsamples - Align_Nfft;
901
+ startr = startd - estdelay;
902
+ }
903
+
904
+ while( (startd >= 0L) &&
905
+ (startr >= (Utt_BPs[bp] * Downsample)) )
906
+ {
907
+ for( count = 0L; count < Align_Nfft; count++ )
908
+ {
909
+ X1[count] = (*ref_info).data[count + startr] * Window[count];
910
+ X2[count] = (*deg_info).data[count + startd] * Window[count];
911
+ }
912
+ RealFFT( X1, Align_Nfft );
913
+ RealFFT( X2, Align_Nfft );
914
+
915
+ for( count = 0L; count <= Align_Nfft/2; count++ )
916
+ {
917
+ r1 = X1[count * 2]; i1 = -X1[1 + (count * 2)];
918
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
919
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
920
+ }
921
+
922
+ RealIFFT( X1, Align_Nfft );
923
+
924
+ v_max = 0.0f;
925
+ for( count = 0L; count < Align_Nfft; count++ )
926
+ {
927
+ r1 = (float) fabs(X1[count]);
928
+ X1[count] = r1;
929
+ if( r1 > v_max ) v_max = r1;
930
+ }
931
+ v_max *= 0.99f;
932
+ n_max = (float) pow( v_max, 0.125 ) / kernel;
933
+
934
+ for( count = 0L; count < Align_Nfft; count++ )
935
+ if( X1[count] > v_max )
936
+ {
937
+ Hsum += n_max * kernel;
938
+ for( k = 1-kernel; k < kernel; k++ )
939
+ H[(count + k + Align_Nfft) % Align_Nfft] +=
940
+ n_max * (kernel - (float) fabs(k));
941
+ }
942
+
943
+ startr -= (Align_Nfft / 4);
944
+ startd -= (Align_Nfft / 4);
945
+ }
946
+
947
+ v_max = 0.0f;
948
+ I_max = 0L;
949
+ for( count = 0L; count < Align_Nfft; count++ )
950
+ if( H[count] > v_max )
951
+ {
952
+ v_max = H[count];
953
+ I_max = count;
954
+ }
955
+ if( I_max >= (Align_Nfft/2) )
956
+ I_max -= Align_Nfft;
957
+
958
+ Utt_D2[bp] = estdelay + I_max;
959
+ if( Hsum > 0.0 )
960
+ Utt_DC2[bp] = v_max / Hsum;
961
+ else
962
+ Utt_DC2[bp] = 0.0f;
963
+
964
+ while( bp > 0 )
965
+ {
966
+ bp--;
967
+ if( (Utt_ED2[bp] == estdelay) && (Utt_DC2[bp] <= -2.0) )
968
+ {
969
+ while( (startd >= 0L) &&
970
+ (startr >= (Utt_BPs[bp] * Downsample)) )
971
+ {
972
+ for( count = 0L; count < Align_Nfft; count++ )
973
+ {
974
+ X1[count] = (*ref_info).data[count + startr] * Window[count];
975
+ X2[count] = (*deg_info).data[count + startd] * Window[count];
976
+ }
977
+ RealFFT( X1, Align_Nfft );
978
+ RealFFT( X2, Align_Nfft );
979
+
980
+ for( count = 0L; count <= Align_Nfft / 2; count++ )
981
+ {
982
+ r1 = X1[count * 2]; i1 = -X1[1 + (count * 2)];
983
+ X1[count * 2] = (r1 * X2[count * 2] - i1 * X2[1 + (count * 2)]);
984
+ X1[1 + (count * 2)] = (r1 * X2[1 + (count * 2)] + i1 * X2[count * 2]);
985
+ }
986
+
987
+ RealIFFT( X1, Align_Nfft );
988
+
989
+ v_max = 0.0f;
990
+ for( count = 0L; count < Align_Nfft; count++ )
991
+ {
992
+ r1 = (float) fabs(X1[count]);
993
+ X1[count] = r1;
994
+ if( r1 > v_max ) v_max = r1;
995
+ }
996
+ v_max *= 0.99f;
997
+ n_max = (float) pow( v_max, 0.125 ) / kernel;
998
+
999
+ for( count = 0L; count < Align_Nfft; count++ )
1000
+ if( X1[count] > v_max )
1001
+ {
1002
+ Hsum += n_max * kernel;
1003
+ for( k = 1-kernel; k < kernel; k++ )
1004
+ H[(count + k + Align_Nfft) % Align_Nfft] +=
1005
+ n_max * (kernel - (float) fabs(k));
1006
+ }
1007
+
1008
+ startr -= (Align_Nfft / 4);
1009
+ startd -= (Align_Nfft / 4);
1010
+ }
1011
+
1012
+ v_max = 0.0f;
1013
+ I_max = 0L;
1014
+ for( count = 0L; count < Align_Nfft; count++ )
1015
+ if( H[count] > v_max )
1016
+ {
1017
+ v_max = H[count];
1018
+ I_max = count;
1019
+ }
1020
+ if( I_max >= (Align_Nfft/2) )
1021
+ I_max -= Align_Nfft;
1022
+
1023
+ Utt_D2[bp] = estdelay + I_max;
1024
+ if( Hsum > 0.0 )
1025
+ Utt_DC2[bp] = v_max / Hsum;
1026
+ else
1027
+ Utt_DC2[bp] = 0.0f;
1028
+ }
1029
+ }
1030
+ }
1031
+
1032
+ for( bp = 0; bp < N_BPs; bp++ )
1033
+ {
1034
+ if( (abs(Utt_D2[bp] - Utt_D1[bp]) >= Downsample) &&
1035
+ ((Utt_DC1[bp] + Utt_DC2[bp]) > ((*Best_DC1) + (*Best_DC2))) &&
1036
+ (Utt_DC1[bp] > Utt_DelayConf) && (Utt_DC2[bp] > Utt_DelayConf) )
1037
+ {
1038
+ *Best_ED1 = Utt_ED1[bp]; *Best_D1 = Utt_D1[bp]; *Best_DC1 = Utt_DC1[bp];
1039
+ *Best_ED2 = Utt_ED2[bp]; *Best_D2 = Utt_D2[bp]; *Best_DC2 = Utt_DC2[bp];
1040
+ *Best_BP = Utt_BPs[bp];
1041
+ }
1042
+ }
1043
+
1044
+ FFTFree();
1045
+ }
1046
+
1047
+ /* END OF FILE */