webrtcvad 0.1.0 → 0.2.3

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/ext/webrtcvad/extconf.rb +29 -0
  3. data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
  4. data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
  5. data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
  6. data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
  7. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
  8. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
  9. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
  10. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
  11. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
  12. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
  13. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
  14. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
  15. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
  16. data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
  17. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
  18. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
  19. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
  20. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
  21. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
  22. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
  23. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
  24. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
  25. data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
  26. data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
  27. data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
  28. data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
  29. data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
  30. data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
  31. data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
  32. data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
  33. data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
  34. data/ext/webrtcvad/webrtcvad.c +112 -0
  35. metadata +37 -3
data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h
@@ -0,0 +1,87 @@
+ /*
+  * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+  *
+  * Use of this source code is governed by a BSD-style license
+  * that can be found in the LICENSE file in the root of the source
+  * tree. An additional intellectual property rights grant can be found
+  * in the file PATENTS. All contributing project authors may
+  * be found in the AUTHORS file in the root of the source tree.
+  */
+
+ /*
+  * This header file includes the VAD API calls. Specific function calls are
+  * given below.
+  */
+
+ #ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT
+ #define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_
+
+ #include <stddef.h>
+ #include <stdint.h>
+
+ typedef struct WebRtcVadInst VadInst;
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // Creates an instance of the VAD structure.
+ VadInst* WebRtcVad_Create(void);
+
+ // Frees the dynamic memory of a specified VAD instance.
+ //
+ // - handle [i] : Pointer to VAD instance that should be freed.
+ void WebRtcVad_Free(VadInst* handle);
+
+ // Initializes a VAD instance.
+ //
+ // - handle [i/o] : Instance that should be initialized.
+ //
+ // returns        : 0 - (OK),
+ //                 -1 - (null pointer or default mode could not be set).
+ int WebRtcVad_Init(VadInst* handle);
+
+ // Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+ // restrictive in reporting speech. In other words, the probability of the
+ // frame being speech when the VAD returns 1 increases with the mode. As a
+ // consequence, the missed detection rate also goes up.
+ //
+ // - handle [i/o] : VAD instance.
+ // - mode   [i]   : Aggressiveness mode (0, 1, 2, or 3).
+ //
+ // returns        : 0 - (OK),
+ //                 -1 - (null pointer, mode could not be set or the VAD
+ //                       instance has not been initialized).
+ int WebRtcVad_set_mode(VadInst* handle, int mode);
+
+ // Calculates a VAD decision for the |audio_frame|. For valid sampling rates
+ // and frame lengths, see the description of
+ // WebRtcVad_ValidRateAndFrameLength().
+ //
+ // - handle       [i/o] : VAD instance. Needs to be initialized by
+ //                        WebRtcVad_Init() before call.
+ // - fs           [i]   : Sampling frequency (Hz): 8000, 16000, or 32000.
+ // - audio_frame  [i]   : Audio frame buffer.
+ // - frame_length [i]   : Length of audio frame buffer in number of samples.
+ //
+ // returns              : 1 - (Active voice),
+ //                        0 - (Non-active voice),
+ //                       -1 - (Error)
+ int WebRtcVad_Process(VadInst* handle,
+                       int fs,
+                       const int16_t* audio_frame,
+                       size_t frame_length);
+
+ // Checks for valid combinations of |rate| and |frame_length|. We support 10,
+ // 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
+ //
+ // - rate         [i] : Sampling frequency (Hz).
+ // - frame_length [i] : Speech frame buffer length in number of samples.
+ //
+ // returns            : 0 - (valid combination), -1 - (invalid combination)
+ int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);
+
+ #ifdef __cplusplus
+ }  // extern "C"
+ #endif
+
+ #endif  // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT
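
A minimal usage sketch of the API declared above. The 16 kHz rate, mode 2, and the silent 10 ms frame are arbitrary example inputs, and the include path depends on how these sources are vendored in your build; only the function names and signatures come from the header itself.

#include <stdint.h>
#include <stdio.h>
#include "webrtc/common_audio/vad/include/webrtc_vad.h"

int main(void) {
  VadInst* handle = WebRtcVad_Create();
  if (handle == NULL || WebRtcVad_Init(handle) != 0) {
    return 1;
  }
  // Mode 2 = "aggressive"; see WebRtcVad_set_mode_core() in vad_core.c.
  if (WebRtcVad_set_mode(handle, 2) != 0) {
    WebRtcVad_Free(handle);
    return 1;
  }
  int16_t frame[160] = { 0 };  // 10 ms of silence at 16000 Hz.
  if (WebRtcVad_ValidRateAndFrameLength(16000, 160) == 0) {
    int decision = WebRtcVad_Process(handle, 16000, frame, 160);
    printf("VAD decision: %d\n", decision);  // 1 voice, 0 no voice, -1 error.
  }
  WebRtcVad_Free(handle);
  return 0;
}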
data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c
@@ -0,0 +1,685 @@
+ /*
+  * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+  *
+  * Use of this source code is governed by a BSD-style license
+  * that can be found in the LICENSE file in the root of the source
+  * tree. An additional intellectual property rights grant can be found
+  * in the file PATENTS. All contributing project authors may
+  * be found in the AUTHORS file in the root of the source tree.
+  */
+
+ #include "common_audio/vad/vad_core.h"
+
+ #include "rtc_base/sanitizer.h"
+ #include "common_audio/signal_processing/include/signal_processing_library.h"
+ #include "common_audio/vad/vad_filterbank.h"
+ #include "common_audio/vad/vad_gmm.h"
+ #include "common_audio/vad/vad_sp.h"
+
+ // Spectrum Weighting
+ static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
+ static const int16_t kNoiseUpdateConst = 655;    // Q15
+ static const int16_t kSpeechUpdateConst = 6554;  // Q15
+ static const int16_t kBackEta = 154;             // Q8
+ // Minimum difference between the two models, Q5
+ static const int16_t kMinimumDifference[kNumChannels] = {
+     544, 544, 576, 576, 576, 576 };
+ // Upper limit of mean value for speech model, Q7
+ static const int16_t kMaximumSpeech[kNumChannels] = {
+     11392, 11392, 11520, 11520, 11520, 11520 };
+ // Minimum value for mean value
+ static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
+ // Upper limit of mean value for noise model, Q7
+ static const int16_t kMaximumNoise[kNumChannels] = {
+     9216, 9088, 8960, 8832, 8704, 8576 };
+ // Start values for the Gaussian models, Q7
+ // Weights for the two Gaussians for the six channels (noise)
+ static const int16_t kNoiseDataWeights[kTableSize] = {
+     34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
+ // Weights for the two Gaussians for the six channels (speech)
+ static const int16_t kSpeechDataWeights[kTableSize] = {
+     48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
+ // Means for the two Gaussians for the six channels (noise)
+ static const int16_t kNoiseDataMeans[kTableSize] = {
+     6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
+ // Means for the two Gaussians for the six channels (speech)
+ static const int16_t kSpeechDataMeans[kTableSize] = {
+     8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
+ };
+ // Stds for the two Gaussians for the six channels (noise)
+ static const int16_t kNoiseDataStds[kTableSize] = {
+     378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
+ // Stds for the two Gaussians for the six channels (speech)
+ static const int16_t kSpeechDataStds[kTableSize] = {
+     555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
+
+ // Constants used in GmmProbability().
+ //
+ // Maximum number of counted speech (VAD = 1) frames in a row.
+ static const int16_t kMaxSpeechFrames = 6;
+ // Minimum standard deviation for both speech and noise.
+ static const int16_t kMinStd = 384;
+
+ // Constants in WebRtcVad_InitCore().
+ // Default aggressiveness mode.
+ static const short kDefaultMode = 0;
+ static const int kInitCheck = 42;
+
+ // Constants used in WebRtcVad_set_mode_core().
+ //
+ // Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
+ //
+ // Mode 0, Quality.
+ static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
+ static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
+ static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
+ static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
+ // Mode 1, Low bitrate.
+ static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
+ static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
+ static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
+ static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
+ // Mode 2, Aggressive.
+ static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
+ static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
+ static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
+ static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
+ // Mode 3, Very aggressive.
+ static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
+ static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
+ static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
+ static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
+
+ // Calculates the weighted average w.r.t. number of Gaussians. The |data| are
+ // updated with an |offset| before averaging.
+ //
+ // - data    [i/o] : Data to average.
+ // - offset  [i]   : An offset added to |data|.
+ // - weights [i]   : Weights used for averaging.
+ //
+ // returns         : The weighted average.
+ static int32_t WeightedAverage(int16_t* data, int16_t offset,
+                                const int16_t* weights) {
+   int k;
+   int32_t weighted_average = 0;
+
+   for (k = 0; k < kNumGaussians; k++) {
+     data[k * kNumChannels] += offset;
+     weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
+   }
+   return weighted_average;
+ }
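
A self-contained sketch of the strided, channel-major layout WeightedAverage() relies on. The enum values are inferred from this file (kNumChannels = 6 and kNumGaussians = 2 are defined in vad_core.h, which is not shown in this diff), and the sample data is taken from kNoiseDataMeans and kNoiseDataWeights above.

#include <stdint.h>
#include <stdio.h>

enum { kNumChannels = 6, kNumGaussians = 2, kTableSize = 12 };

int main(void) {
  // Means are stored channel-major: Gaussian k of channel c sits at
  // index c + k * kNumChannels, exactly as indexed in WeightedAverage().
  int16_t means[kTableSize] = {
      6738, 4892, 7065, 6715, 6771, 3369,    // Gaussian 0, channels 0..5
      7646, 3863, 7820, 7266, 5020, 4362 };  // Gaussian 1, channels 0..5
  int16_t weights[kTableSize] = {
      34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
  int channel = 0;
  // Q7 means * Q7 weights = Q14 "global" mean for one channel (offset = 0).
  int32_t global_mean = 0;
  for (int k = 0; k < kNumGaussians; k++) {
    global_mean += means[channel + k * kNumChannels] *
                   weights[channel + k * kNumChannels];
  }
  printf("noise global mean (Q14): %d\n", (int)global_mean);
  return 0;
}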
+
+ // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
+ // undefined behavior, so not a good idea; this just makes UBSan ignore the
+ // violation, so that our old code can continue to do what it's always been
+ // doing.)
+ static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
+     OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) {
+   return a * b;
+ }
+
+ // Calculates the probabilities for both speech and background noise using
+ // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide
+ // which type of signal is most probable.
+ //
+ // - self         [i/o] : Pointer to VAD instance
+ // - features     [i]   : Feature vector of length |kNumChannels|
+ //                        = log10(energy in frequency band)
+ // - total_power  [i]   : Total power in audio frame.
+ // - frame_length [i]   : Number of input samples
+ //
+ // - returns            : the VAD decision (0 - noise, 1 - speech).
+ static int16_t GmmProbability(VadInstT* self, int16_t* features,
+                               int16_t total_power, size_t frame_length) {
+   int channel, k;
+   int16_t feature_minimum;
+   int16_t h0, h1;
+   int16_t log_likelihood_ratio;
+   int16_t vadflag = 0;
+   int16_t shifts_h0, shifts_h1;
+   int16_t tmp_s16, tmp1_s16, tmp2_s16;
+   int16_t diff;
+   int gaussian;
+   int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
+   int16_t delt, ndelt;
+   int16_t maxspe, maxmu;
+   int16_t deltaN[kTableSize], deltaS[kTableSize];
+   int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
+   int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
+   int32_t h0_test, h1_test;
+   int32_t tmp1_s32, tmp2_s32;
+   int32_t sum_log_likelihood_ratios = 0;
+   int32_t noise_global_mean, speech_global_mean;
+   int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
+   int16_t overhead1, overhead2, individualTest, totalTest;
+
+   // Set various thresholds based on frame lengths (80, 160 or 240 samples).
+   if (frame_length == 80) {
+     overhead1 = self->over_hang_max_1[0];
+     overhead2 = self->over_hang_max_2[0];
+     individualTest = self->individual[0];
+     totalTest = self->total[0];
+   } else if (frame_length == 160) {
+     overhead1 = self->over_hang_max_1[1];
+     overhead2 = self->over_hang_max_2[1];
+     individualTest = self->individual[1];
+     totalTest = self->total[1];
+   } else {
+     overhead1 = self->over_hang_max_1[2];
+     overhead2 = self->over_hang_max_2[2];
+     individualTest = self->individual[2];
+     totalTest = self->total[2];
+   }
+
+   if (total_power > kMinEnergy) {
+     // The signal power of the current frame is large enough for processing.
+     // The processing consists of two parts:
+     // 1) Calculating the likelihood of speech and thereby a VAD decision.
+     // 2) Updating the underlying model, w.r.t. the decision made.
+
+     // The detection scheme is an LRT with hypothesis
+     // H0: Noise
+     // H1: Speech
+     //
+     // We combine a global LRT with local tests, for each frequency sub-band,
+     // here defined as |channel|.
+     for (channel = 0; channel < kNumChannels; channel++) {
+       // For each channel we model the probability with a GMM consisting of
+       // |kNumGaussians|, with different means and standard deviations
+       // depending on H0 or H1.
+       h0_test = 0;
+       h1_test = 0;
+       for (k = 0; k < kNumGaussians; k++) {
+         gaussian = channel + k * kNumChannels;
+         // Probability under H0, that is, probability of frame being noise.
+         // Value given in Q27 = Q7 * Q20.
+         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
+                                                  self->noise_means[gaussian],
+                                                  self->noise_stds[gaussian],
+                                                  &deltaN[gaussian]);
+         noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
+         h0_test += noise_probability[k];  // Q27
+
+         // Probability under H1, that is, probability of frame being speech.
+         // Value given in Q27 = Q7 * Q20.
+         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
+                                                  self->speech_means[gaussian],
+                                                  self->speech_stds[gaussian],
+                                                  &deltaS[gaussian]);
+         speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
+         h1_test += speech_probability[k];  // Q27
+       }
+
+       // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
+       // Approximation:
+       // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
+       //                           = log2(h1_test) - log2(h0_test)
+       //                           = log2(2^(31-shifts_h1)*(1+b1))
+       //                             - log2(2^(31-shifts_h0)*(1+b0))
+       //                           = shifts_h0 - shifts_h1
+       //                             + log2(1+b1) - log2(1+b0)
+       //                          ~= shifts_h0 - shifts_h1
+       //
+       // Note that b0 and b1 are values less than 1, hence,
+       // 0 <= log2(1+b0) < 1. Further, b0 and b1 are independent and on the
+       // average the two terms cancel.
+       shifts_h0 = WebRtcSpl_NormW32(h0_test);
+       shifts_h1 = WebRtcSpl_NormW32(h1_test);
+       if (h0_test == 0) {
+         shifts_h0 = 31;
+       }
+       if (h1_test == 0) {
+         shifts_h1 = 31;
+       }
+       log_likelihood_ratio = shifts_h0 - shifts_h1;
+
+       // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
+       // used for the global VAD decision.
+       sum_log_likelihood_ratios +=
+           (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
+
+       // Local VAD decision.
+       if ((log_likelihood_ratio * 4) > individualTest) {
+         vadflag = 1;
+       }
+
+       // TODO(bjornv): The conditional probabilities below are applied on the
+       // hard coded number of Gaussians set to two. Find a way to generalize.
+       // Calculate local noise probabilities used later when updating the GMM.
+       h0 = (int16_t) (h0_test >> 12);  // Q15
+       if (h0 > 0) {
+         // High probability of noise. Assign conditional probabilities for
+         // each Gaussian in the GMM.
+         tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
+         ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
+         ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
+       } else {
+         // Low noise probability. Assign conditional probability 1 to the
+         // first Gaussian and 0 to the rest (which is already set at
+         // initialization).
+         ngprvec[channel] = 16384;
+       }
+
+       // Calculate local speech probabilities used later when updating the
+       // GMM.
+       h1 = (int16_t) (h1_test >> 12);  // Q15
+       if (h1 > 0) {
+         // High probability of speech. Assign conditional probabilities for
+         // each Gaussian in the GMM. Otherwise use the initialized values,
+         // i.e., 0.
+         tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
+         sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
+         sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
+       }
+     }
+
+     // Make a global VAD decision.
+     vadflag |= (sum_log_likelihood_ratios >= totalTest);
+
+     // Update the model parameters.
+     maxspe = 12800;
+     for (channel = 0; channel < kNumChannels; channel++) {
+
+       // Get minimum value in past which is used for long term correction in
+       // Q4.
+       feature_minimum = WebRtcVad_FindMinimum(self, features[channel],
+                                               channel);
+
+       // Compute the "global" mean, that is the sum of the two means weighted.
+       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
+                                           &kNoiseDataWeights[channel]);
+       tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8
+
+       for (k = 0; k < kNumGaussians; k++) {
+         gaussian = channel + k * kNumChannels;
+
+         nmk = self->noise_means[gaussian];
+         smk = self->speech_means[gaussian];
+         nsk = self->noise_stds[gaussian];
+         ssk = self->speech_stds[gaussian];
+
+         // Update noise mean vector if the frame consists of noise only.
+         nmk2 = nmk;
+         if (!vadflag) {
+           // deltaN = (x-mu)/sigma^2
+           // ngprvec[k] = |noise_probability[k]| /
+           //   (|noise_probability[0]| + |noise_probability[1]|)
+
+           // (Q14 * Q11 >> 11) = Q14.
+           delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
+           // Q7 + (Q14 * Q15 >> 22) = Q7.
+           nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
+         }
+
+         // Long term correction of the noise mean.
+         // Q8 - Q8 = Q8.
+         ndelt = (feature_minimum << 4) - tmp1_s16;
+         // Q7 + (Q8 * Q8) >> 9 = Q7.
+         nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
+
+         // Control that the noise mean does not drift too much.
+         tmp_s16 = (int16_t) ((k + 5) << 7);
+         if (nmk3 < tmp_s16) {
+           nmk3 = tmp_s16;
+         }
+         tmp_s16 = (int16_t) ((72 + k - channel) << 7);
+         if (nmk3 > tmp_s16) {
+           nmk3 = tmp_s16;
+         }
+         self->noise_means[gaussian] = nmk3;
+
+         if (vadflag) {
+           // Update speech mean vector:
+           // |deltaS| = (x-mu)/sigma^2
+           // sgprvec[k] = |speech_probability[k]| /
+           //   (|speech_probability[0]| + |speech_probability[1]|)
+
+           // (Q14 * Q11) >> 11 = Q14.
+           delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
+           // Q14 * Q15 >> 21 = Q8.
+           tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
+           // Q7 + (Q8 >> 1) = Q7. With rounding.
+           smk2 = smk + ((tmp_s16 + 1) >> 1);
+
+           // Control that the speech mean does not drift too much.
+           maxmu = maxspe + 640;
+           if (smk2 < kMinimumMean[k]) {
+             smk2 = kMinimumMean[k];
+           }
+           if (smk2 > maxmu) {
+             smk2 = maxmu;
+           }
+           self->speech_means[gaussian] = smk2;  // Q7.
+
+           // (Q7 >> 3) = Q4. With rounding.
+           tmp_s16 = ((smk + 4) >> 3);
+
+           tmp_s16 = features[channel] - tmp_s16;  // Q4
+           // (Q11 * Q4 >> 3) = Q12.
+           tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
+           tmp2_s32 = tmp1_s32 - 4096;
+           tmp_s16 = sgprvec[gaussian] >> 2;
+           // (Q14 >> 2) * Q12 = Q24.
+           tmp1_s32 = tmp_s16 * tmp2_s32;
+
+           tmp2_s32 = tmp1_s32 >> 4;  // Q20
+
+           // 0.1 * Q20 / Q7 = Q13.
+           if (tmp2_s32 > 0) {
+             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
+           } else {
+             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
+             tmp_s16 = -tmp_s16;
+           }
+           // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
+           // Note that division by 4 equals shift by 2, hence,
+           // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
+           tmp_s16 += 128;  // Rounding.
+           ssk += (tmp_s16 >> 8);
+           if (ssk < kMinStd) {
+             ssk = kMinStd;
+           }
+           self->speech_stds[gaussian] = ssk;
+         } else {
+           // Update GMM variance vectors.
+           // deltaN * (features[channel] - nmk) - 1
+           // Q4 - (Q7 >> 3) = Q4.
+           tmp_s16 = features[channel] - (nmk >> 3);
+           // (Q11 * Q4 >> 3) = Q12.
+           tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
+           tmp1_s32 -= 4096;
+
+           // (Q14 >> 2) * Q12 = Q24.
+           tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
+           tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
+           // Q20 * approx 0.001 (2^-10=0.0009766), hence,
+           // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
+           tmp1_s32 = tmp2_s32 >> 14;
+
+           // Q20 / Q7 = Q13.
+           if (tmp1_s32 > 0) {
+             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
+           } else {
+             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
+             tmp_s16 = -tmp_s16;
+           }
+           tmp_s16 += 32;  // Rounding
+           nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
+           if (nsk < kMinStd) {
+             nsk = kMinStd;
+           }
+           self->noise_stds[gaussian] = nsk;
+         }
+       }
+
+       // Separate models if they are too close.
+       // |noise_global_mean| in Q14 (= Q7 * Q7).
+       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
+                                           &kNoiseDataWeights[channel]);
+
+       // |speech_global_mean| in Q14 (= Q7 * Q7).
+       speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
+                                            &kSpeechDataWeights[channel]);
+
+       // |diff| = "global" speech mean - "global" noise mean.
+       // (Q14 >> 9) - (Q14 >> 9) = Q5.
+       diff = (int16_t) (speech_global_mean >> 9) -
+              (int16_t) (noise_global_mean >> 9);
+       if (diff < kMinimumDifference[channel]) {
+         tmp_s16 = kMinimumDifference[channel] - diff;
+
+         // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
+         // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
+         tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
+         tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
+
+         // Move Gaussian means for speech model by |tmp1_s16| and update
+         // |speech_global_mean|. Note that |self->speech_means[channel]| is
+         // changed after the call.
+         speech_global_mean = WeightedAverage(&self->speech_means[channel],
+                                              tmp1_s16,
+                                              &kSpeechDataWeights[channel]);
+
+         // Move Gaussian means for noise model by -|tmp2_s16| and update
+         // |noise_global_mean|. Note that |self->noise_means[channel]| is
+         // changed after the call.
+         noise_global_mean = WeightedAverage(&self->noise_means[channel],
+                                             -tmp2_s16,
+                                             &kNoiseDataWeights[channel]);
+       }
+
+       // Control that the speech & noise means do not drift too much.
+       maxspe = kMaximumSpeech[channel];
+       tmp2_s16 = (int16_t) (speech_global_mean >> 7);
+       if (tmp2_s16 > maxspe) {
+         // Upper limit of speech model.
+         tmp2_s16 -= maxspe;
+
+         for (k = 0; k < kNumGaussians; k++) {
+           self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
+         }
+       }
+
+       tmp2_s16 = (int16_t) (noise_global_mean >> 7);
+       if (tmp2_s16 > kMaximumNoise[channel]) {
+         tmp2_s16 -= kMaximumNoise[channel];
+
+         for (k = 0; k < kNumGaussians; k++) {
+           self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
+         }
+       }
+     }
+     self->frame_counter++;
+   }
+
+   // Smooth with respect to transition hysteresis.
+   if (!vadflag) {
+     if (self->over_hang > 0) {
+       vadflag = 2 + self->over_hang;
+       self->over_hang--;
+     }
+     self->num_of_speech = 0;
+   } else {
+     self->num_of_speech++;
+     if (self->num_of_speech > kMaxSpeechFrames) {
+       self->num_of_speech = kMaxSpeechFrames;
+       self->over_hang = overhead2;
+     } else {
+       self->over_hang = overhead1;
+     }
+   }
+   return vadflag;
+ }
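
The shifts-based log2 approximation above is the heart of the LRT: for positive 32-bit scores, the difference in normalization shifts approximates log2(h1_test) - log2(h0_test). A standalone sketch of this idea; NormW32 here is a portable stand-in written from the comment's description of WebRtcSpl_NormW32, valid for positive inputs only, not taken from the SPL sources.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for WebRtcSpl_NormW32: the number of left shifts that brings a
 * positive 32-bit value up to the normalized range [2^30, 2^31). */
static int NormW32(int32_t a) {
  int shifts = 0;
  if (a == 0) return 0;
  while ((a & 0x40000000) == 0) {  /* shift until bit 30 is set */
    a <<= 1;
    shifts++;
  }
  return shifts;
}

int main(void) {
  /* Two mock hypothesis scores (Q27): h1 is 8x larger than h0, so the true
   * log2 ratio is 3. The shift difference recovers it exactly here, and to
   * within about +/-1 in general (the log2(1+b) terms the comment drops). */
  int32_t h0_test = 1 << 20;
  int32_t h1_test = 1 << 23;
  int log_likelihood_ratio = NormW32(h0_test) - NormW32(h1_test);
  printf("approx log2(h1/h0) = %d\n", log_likelihood_ratio);  /* prints 3 */
  return 0;
}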
+
+ // Initialize the VAD. Set aggressiveness mode to default value.
+ int WebRtcVad_InitCore(VadInstT* self) {
+   int i;
+
+   if (self == NULL) {
+     return -1;
+   }
+
+   // Initialization of general struct variables.
+   self->vad = 1;  // Speech active (=1).
+   self->frame_counter = 0;
+   self->over_hang = 0;
+   self->num_of_speech = 0;
+
+   // Initialization of downsampling filter state.
+   memset(self->downsampling_filter_states, 0,
+          sizeof(self->downsampling_filter_states));
+
+   // Initialization of 48 to 8 kHz downsampling.
+   WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
+
+   // Read initial PDF parameters.
+   for (i = 0; i < kTableSize; i++) {
+     self->noise_means[i] = kNoiseDataMeans[i];
+     self->speech_means[i] = kSpeechDataMeans[i];
+     self->noise_stds[i] = kNoiseDataStds[i];
+     self->speech_stds[i] = kSpeechDataStds[i];
+   }
+
+   // Initialize Index and Minimum value vectors.
+   for (i = 0; i < 16 * kNumChannels; i++) {
+     self->low_value_vector[i] = 10000;
+     self->index_vector[i] = 0;
+   }
+
+   // Initialize splitting filter states.
+   memset(self->upper_state, 0, sizeof(self->upper_state));
+   memset(self->lower_state, 0, sizeof(self->lower_state));
+
+   // Initialize high pass filter states.
+   memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
+
+   // Initialize mean value memory, for WebRtcVad_FindMinimum().
+   for (i = 0; i < kNumChannels; i++) {
+     self->mean_value[i] = 1600;
+   }
+
+   // Set aggressiveness mode to default (=|kDefaultMode|).
+   if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
+     return -1;
+   }
+
+   self->init_flag = kInitCheck;
+
+   return 0;
+ }
+
+ // Set aggressiveness mode.
+ int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
+   int return_value = 0;
+
+   switch (mode) {
+     case 0:
+       // Quality mode.
+       memcpy(self->over_hang_max_1, kOverHangMax1Q,
+              sizeof(self->over_hang_max_1));
+       memcpy(self->over_hang_max_2, kOverHangMax2Q,
+              sizeof(self->over_hang_max_2));
+       memcpy(self->individual, kLocalThresholdQ,
+              sizeof(self->individual));
+       memcpy(self->total, kGlobalThresholdQ,
+              sizeof(self->total));
+       break;
+     case 1:
+       // Low bitrate mode.
+       memcpy(self->over_hang_max_1, kOverHangMax1LBR,
+              sizeof(self->over_hang_max_1));
+       memcpy(self->over_hang_max_2, kOverHangMax2LBR,
+              sizeof(self->over_hang_max_2));
+       memcpy(self->individual, kLocalThresholdLBR,
+              sizeof(self->individual));
+       memcpy(self->total, kGlobalThresholdLBR,
+              sizeof(self->total));
+       break;
+     case 2:
+       // Aggressive mode.
+       memcpy(self->over_hang_max_1, kOverHangMax1AGG,
+              sizeof(self->over_hang_max_1));
+       memcpy(self->over_hang_max_2, kOverHangMax2AGG,
+              sizeof(self->over_hang_max_2));
+       memcpy(self->individual, kLocalThresholdAGG,
+              sizeof(self->individual));
+       memcpy(self->total, kGlobalThresholdAGG,
+              sizeof(self->total));
+       break;
+     case 3:
+       // Very aggressive mode.
+       memcpy(self->over_hang_max_1, kOverHangMax1VAG,
+              sizeof(self->over_hang_max_1));
+       memcpy(self->over_hang_max_2, kOverHangMax2VAG,
+              sizeof(self->over_hang_max_2));
+       memcpy(self->individual, kLocalThresholdVAG,
+              sizeof(self->individual));
+       memcpy(self->total, kGlobalThresholdVAG,
+              sizeof(self->total));
+       break;
+     default:
+       return_value = -1;
+       break;
+   }
+
+   return return_value;
+ }
+
+ // Calculate the VAD decision by first extracting feature values and then
+ // calculating the probability of both speech and background noise.
+
+ int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
+                            size_t frame_length) {
+   int vad;
+   size_t i;
+   int16_t speech_nb[240];  // 30 ms in 8 kHz.
+   // |tmp_mem| is a temporary buffer used by the resample function; its
+   // length is one 10 ms frame (480 samples) + 256 extra.
+   int32_t tmp_mem[480 + 256] = { 0 };
+   const size_t kFrameLen10ms48khz = 480;
+   const size_t kFrameLen10ms8khz = 80;
+   size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
+
+   for (i = 0; i < num_10ms_frames; i++) {
+     WebRtcSpl_Resample48khzTo8khz(speech_frame,
+                                   &speech_nb[i * kFrameLen10ms8khz],
+                                   &inst->state_48_to_8,
+                                   tmp_mem);
+   }
+
+   // Do VAD on an 8 kHz signal.
+   vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
+
+   return vad;
+ }
+
+ int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
+                            size_t frame_length)
+ {
+   size_t len;
+   int vad;
+   int16_t speechWB[480];  // Downsampled speech frame: 960 samples (30ms in SWB)
+   int16_t speechNB[240];  // Downsampled speech frame: 480 samples (30ms in WB)
+
+   // Downsample signal 32->16->8 before doing VAD.
+   WebRtcVad_Downsampling(speech_frame, speechWB,
+                          &(inst->downsampling_filter_states[2]),
+                          frame_length);
+   len = frame_length / 2;
+
+   WebRtcVad_Downsampling(speechWB, speechNB,
+                          inst->downsampling_filter_states, len);
+   len /= 2;
+
+   // Do VAD on an 8 kHz signal.
+   vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
+
+   return vad;
+ }
+
+ int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
+                            size_t frame_length)
+ {
+   size_t len;
+   int vad;
+   int16_t speechNB[240];  // Downsampled speech frame: 480 samples (30ms in WB)
+
+   // Wideband: Downsample signal before doing VAD.
+   WebRtcVad_Downsampling(speech_frame, speechNB,
+                          inst->downsampling_filter_states, frame_length);
+
+   len = frame_length / 2;
+   vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
+
+   return vad;
+ }
+
+ int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
+                           size_t frame_length)
+ {
+   int16_t feature_vector[kNumChannels], total_power;
+
+   // Get power in the bands.
+   total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
+                                             feature_vector);
+
+   // Make a VAD decision.
+   inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
+
+   return inst->vad;
+ }
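
The three wrappers above reduce every supported rate to the 8 kHz core: 48 kHz through the dedicated resampler (length / 6), 32 kHz through two halving passes (length / 4), and 16 kHz through one (length / 2). A sketch of the resulting frame lengths; SamplesAt8khz is a hypothetical helper that only summarizes those divisions, not a function in this library.

#include <stddef.h>
#include <stdio.h>

static size_t SamplesAt8khz(int fs, size_t frame_length) {
  switch (fs) {
    case 48000: return frame_length / 6;  /* WebRtcVad_CalcVad48khz */
    case 32000: return frame_length / 4;  /* two halving passes */
    case 16000: return frame_length / 2;  /* one halving pass */
    case 8000:  return frame_length;      /* no resampling */
    default:    return 0;
  }
}

int main(void) {
  /* A 30 ms frame at each supported rate maps to 240 samples at 8 kHz. */
  printf("%zu %zu %zu %zu\n",
         SamplesAt8khz(48000, 1440), SamplesAt8khz(32000, 960),
         SamplesAt8khz(16000, 480), SamplesAt8khz(8000, 240));
  return 0;
}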