webrtcvad 0.1.0 → 0.2.3
- checksums.yaml +4 -4
- data/ext/webrtcvad/extconf.rb +29 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
- data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
- data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
- data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
- data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
- data/ext/webrtcvad/webrtcvad.c +112 -0
- metadata +37 -3
data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h
@@ -0,0 +1,87 @@
/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * This header file includes the VAD API calls. Specific function calls are
 * given below.
 */

#ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT
#define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_

#include <stddef.h>
#include <stdint.h>

typedef struct WebRtcVadInst VadInst;

#ifdef __cplusplus
extern "C" {
#endif

// Creates an instance to the VAD structure.
VadInst* WebRtcVad_Create(void);

// Frees the dynamic memory of a specified VAD instance.
//
// - handle [i] : Pointer to VAD instance that should be freed.
void WebRtcVad_Free(VadInst* handle);

// Initializes a VAD instance.
//
// - handle [i/o] : Instance that should be initialized.
//
// returns        : 0 - (OK),
//                  -1 - (null pointer or Default mode could not be set).
int WebRtcVad_Init(VadInst* handle);

// Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
// restrictive in reporting speech. Put in other words the probability of being
// speech when the VAD returns 1 is increased with increasing mode. As a
// consequence also the missed detection rate goes up.
//
// - handle [i/o] : VAD instance.
// - mode   [i]   : Aggressiveness mode (0, 1, 2, or 3).
//
// returns        : 0 - (OK),
//                  -1 - (null pointer, mode could not be set or the VAD instance
//                        has not been initialized).
int WebRtcVad_set_mode(VadInst* handle, int mode);

// Calculates a VAD decision for the |audio_frame|. For valid sampling rates
// frame lengths, see the description of WebRtcVad_ValidRatesAndFrameLengths().
//
// - handle       [i/o] : VAD Instance. Needs to be initialized by
//                        WebRtcVad_Init() before call.
// - fs           [i]   : Sampling frequency (Hz): 8000, 16000, or 32000
// - audio_frame  [i]   : Audio frame buffer.
// - frame_length [i]   : Length of audio frame buffer in number of samples.
//
// returns              : 1 - (Active Voice),
//                        0 - (Non-active Voice),
//                       -1 - (Error)
int WebRtcVad_Process(VadInst* handle,
                      int fs,
                      const int16_t* audio_frame,
                      size_t frame_length);

// Checks for valid combinations of |rate| and |frame_length|. We support 10,
// 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
//
// - rate         [i] : Sampling frequency (Hz).
// - frame_length [i] : Speech frame buffer length in number of samples.
//
// returns            : 0 - (valid combination), -1 - (invalid combination)
int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);

#ifdef __cplusplus
}
#endif

#endif  // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT
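The header above is the gem's entire public C surface. The following is a minimal usage sketch, not taken from the gem itself: the include path, the 8000 Hz / 160-sample (20 ms) frame, and the all-zero buffer are placeholder assumptions chosen only to keep the example self-contained and within the rate/length combinations documented above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "webrtc/common_audio/vad/include/webrtc_vad.h"  // illustrative path

int main(void) {
  // One 20 ms frame at 8000 Hz = 160 samples of 16-bit mono PCM.
  // Silence is used here only to keep the example self-contained.
  int16_t frame[160];
  memset(frame, 0, sizeof(frame));

  VadInst* vad = WebRtcVad_Create();
  if (vad == NULL || WebRtcVad_Init(vad) != 0) {
    return 1;
  }
  // Mode 3 is the most aggressive setting (most restrictive in reporting speech).
  if (WebRtcVad_set_mode(vad, 3) != 0) {
    WebRtcVad_Free(vad);
    return 1;
  }
  if (WebRtcVad_ValidRateAndFrameLength(8000, 160) == 0) {
    int result = WebRtcVad_Process(vad, 8000, frame, 160);
    printf("VAD decision: %d\n", result);  // 1 voice, 0 non-voice, -1 error
  }
  WebRtcVad_Free(vad);
  return 0;
}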
data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c
@@ -0,0 +1,685 @@
/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/vad/vad_core.h"

#include "rtc_base/sanitizer.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "common_audio/vad/vad_filterbank.h"
#include "common_audio/vad/vad_gmm.h"
#include "common_audio/vad/vad_sp.h"

// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655;  // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };

|
+
// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
|
94
|
+
// updated with an |offset| before averaging.
|
95
|
+
//
|
96
|
+
// - data [i/o] : Data to average.
|
97
|
+
// - offset [i] : An offset added to |data|.
|
98
|
+
// - weights [i] : Weights used for averaging.
|
99
|
+
//
|
100
|
+
// returns : The weighted average.
|
101
|
+
static int32_t WeightedAverage(int16_t* data, int16_t offset,
|
102
|
+
const int16_t* weights) {
|
103
|
+
int k;
|
104
|
+
int32_t weighted_average = 0;
|
105
|
+
|
106
|
+
for (k = 0; k < kNumGaussians; k++) {
|
107
|
+
data[k * kNumChannels] += offset;
|
108
|
+
weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
|
109
|
+
}
|
110
|
+
return weighted_average;
|
111
|
+
}
|
112
|
+
|
113
|
+
// An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
|
114
|
+
// undefined behavior, so not a good idea; this just makes UBSan ignore the
|
115
|
+
// violation, so that our old code can continue to do what it's always been
|
116
|
+
// doing.)
|
117
|
+
static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
|
118
|
+
OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) {
|
119
|
+
return a * b;
|
120
|
+
}
|
121
|
+
|
122
|
+
// Calculates the probabilities for both speech and background noise using
|
123
|
+
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
|
124
|
+
// type of signal is most probable.
|
125
|
+
//
|
126
|
+
// - self [i/o] : Pointer to VAD instance
|
127
|
+
// - features [i] : Feature vector of length |kNumChannels|
|
128
|
+
// = log10(energy in frequency band)
|
129
|
+
// - total_power [i] : Total power in audio frame.
|
130
|
+
// - frame_length [i] : Number of input samples
|
131
|
+
//
|
132
|
+
// - returns : the VAD decision (0 - noise, 1 - speech).
|
133
|
+
static int16_t GmmProbability(VadInstT* self, int16_t* features,
|
134
|
+
int16_t total_power, size_t frame_length) {
|
135
|
+
int channel, k;
|
136
|
+
int16_t feature_minimum;
|
137
|
+
int16_t h0, h1;
|
138
|
+
int16_t log_likelihood_ratio;
|
139
|
+
int16_t vadflag = 0;
|
140
|
+
int16_t shifts_h0, shifts_h1;
|
141
|
+
int16_t tmp_s16, tmp1_s16, tmp2_s16;
|
142
|
+
int16_t diff;
|
143
|
+
int gaussian;
|
144
|
+
int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
|
145
|
+
int16_t delt, ndelt;
|
146
|
+
int16_t maxspe, maxmu;
|
147
|
+
int16_t deltaN[kTableSize], deltaS[kTableSize];
|
148
|
+
int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
|
149
|
+
int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
|
150
|
+
int32_t h0_test, h1_test;
|
151
|
+
int32_t tmp1_s32, tmp2_s32;
|
152
|
+
int32_t sum_log_likelihood_ratios = 0;
|
153
|
+
int32_t noise_global_mean, speech_global_mean;
|
154
|
+
int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
|
155
|
+
int16_t overhead1, overhead2, individualTest, totalTest;
|
156
|
+
|
157
|
+
// Set various thresholds based on frame lengths (80, 160 or 240 samples).
|
158
|
+
if (frame_length == 80) {
|
159
|
+
overhead1 = self->over_hang_max_1[0];
|
160
|
+
overhead2 = self->over_hang_max_2[0];
|
161
|
+
individualTest = self->individual[0];
|
162
|
+
totalTest = self->total[0];
|
163
|
+
} else if (frame_length == 160) {
|
164
|
+
overhead1 = self->over_hang_max_1[1];
|
165
|
+
overhead2 = self->over_hang_max_2[1];
|
166
|
+
individualTest = self->individual[1];
|
167
|
+
totalTest = self->total[1];
|
168
|
+
} else {
|
169
|
+
overhead1 = self->over_hang_max_1[2];
|
170
|
+
overhead2 = self->over_hang_max_2[2];
|
171
|
+
individualTest = self->individual[2];
|
172
|
+
totalTest = self->total[2];
|
173
|
+
}
|
174
|
+
|
175
|
+
if (total_power > kMinEnergy) {
|
176
|
+
// The signal power of current frame is large enough for processing. The
|
177
|
+
// processing consists of two parts:
|
178
|
+
// 1) Calculating the likelihood of speech and thereby a VAD decision.
|
179
|
+
// 2) Updating the underlying model, w.r.t., the decision made.
|
180
|
+
|
181
|
+
// The detection scheme is an LRT with hypothesis
|
182
|
+
// H0: Noise
|
183
|
+
// H1: Speech
|
184
|
+
//
|
185
|
+
// We combine a global LRT with local tests, for each frequency sub-band,
|
186
|
+
// here defined as |channel|.
|
187
|
+
for (channel = 0; channel < kNumChannels; channel++) {
|
188
|
+
// For each channel we model the probability with a GMM consisting of
|
189
|
+
// |kNumGaussians|, with different means and standard deviations depending
|
190
|
+
// on H0 or H1.
|
191
|
+
h0_test = 0;
|
192
|
+
h1_test = 0;
|
193
|
+
for (k = 0; k < kNumGaussians; k++) {
|
194
|
+
gaussian = channel + k * kNumChannels;
|
195
|
+
// Probability under H0, that is, probability of frame being noise.
|
196
|
+
// Value given in Q27 = Q7 * Q20.
|
197
|
+
tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
|
198
|
+
self->noise_means[gaussian],
|
199
|
+
self->noise_stds[gaussian],
|
200
|
+
&deltaN[gaussian]);
|
201
|
+
noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
|
202
|
+
h0_test += noise_probability[k]; // Q27
|
203
|
+
|
204
|
+
// Probability under H1, that is, probability of frame being speech.
|
205
|
+
// Value given in Q27 = Q7 * Q20.
|
206
|
+
tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
|
207
|
+
self->speech_means[gaussian],
|
208
|
+
self->speech_stds[gaussian],
|
209
|
+
&deltaS[gaussian]);
|
210
|
+
speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
|
211
|
+
h1_test += speech_probability[k]; // Q27
|
212
|
+
}
|
213
|
+
|
214
|
+
// Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
|
215
|
+
// Approximation:
|
216
|
+
// log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
|
217
|
+
// = log2(h1_test) - log2(h0_test)
|
218
|
+
// = log2(2^(31-shifts_h1)*(1+b1))
|
219
|
+
// - log2(2^(31-shifts_h0)*(1+b0))
|
220
|
+
// = shifts_h0 - shifts_h1
|
221
|
+
// + log2(1+b1) - log2(1+b0)
|
222
|
+
// ~= shifts_h0 - shifts_h1
|
223
|
+
//
|
224
|
+
// Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
|
225
|
+
// Further, b0 and b1 are independent and on the average the two terms
|
226
|
+
// cancel.
|
227
|
+
shifts_h0 = WebRtcSpl_NormW32(h0_test);
|
228
|
+
shifts_h1 = WebRtcSpl_NormW32(h1_test);
|
229
|
+
if (h0_test == 0) {
|
230
|
+
shifts_h0 = 31;
|
231
|
+
}
|
232
|
+
if (h1_test == 0) {
|
233
|
+
shifts_h1 = 31;
|
234
|
+
}
|
235
|
+
log_likelihood_ratio = shifts_h0 - shifts_h1;
|
236
|
+
|
237
|
+
// Update |sum_log_likelihood_ratios| with spectrum weighting. This is
|
238
|
+
// used for the global VAD decision.
|
239
|
+
sum_log_likelihood_ratios +=
|
240
|
+
(int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
|
241
|
+
|
242
|
+
// Local VAD decision.
|
243
|
+
if ((log_likelihood_ratio * 4) > individualTest) {
|
244
|
+
vadflag = 1;
|
245
|
+
}
|
246
|
+
|
247
|
+
// TODO(bjornv): The conditional probabilities below are applied on the
|
248
|
+
// hard coded number of Gaussians set to two. Find a way to generalize.
|
249
|
+
// Calculate local noise probabilities used later when updating the GMM.
|
250
|
+
h0 = (int16_t) (h0_test >> 12); // Q15
|
251
|
+
if (h0 > 0) {
|
252
|
+
// High probability of noise. Assign conditional probabilities for each
|
253
|
+
// Gaussian in the GMM.
|
254
|
+
tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29
|
255
|
+
ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14
|
256
|
+
ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
|
257
|
+
} else {
|
258
|
+
// Low noise probability. Assign conditional probability 1 to the first
|
259
|
+
// Gaussian and 0 to the rest (which is already set at initialization).
|
260
|
+
ngprvec[channel] = 16384;
|
261
|
+
}
|
262
|
+
|
263
|
+
// Calculate local speech probabilities used later when updating the GMM.
|
264
|
+
h1 = (int16_t) (h1_test >> 12); // Q15
|
265
|
+
if (h1 > 0) {
|
266
|
+
// High probability of speech. Assign conditional probabilities for each
|
267
|
+
// Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
|
268
|
+
tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29
|
269
|
+
sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14
|
270
|
+
sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
|
271
|
+
}
|
272
|
+
}
|
273
|
+
|
274
|
+
// Make a global VAD decision.
|
275
|
+
vadflag |= (sum_log_likelihood_ratios >= totalTest);
|
276
|
+
|
277
|
+
// Update the model parameters.
|
278
|
+
maxspe = 12800;
|
279
|
+
for (channel = 0; channel < kNumChannels; channel++) {
|
280
|
+
|
281
|
+
// Get minimum value in past which is used for long term correction in Q4.
|
282
|
+
feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
|
283
|
+
|
284
|
+
// Compute the "global" mean, that is the sum of the two means weighted.
|
285
|
+
noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
|
286
|
+
&kNoiseDataWeights[channel]);
|
287
|
+
tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8
|
288
|
+
|
289
|
+
for (k = 0; k < kNumGaussians; k++) {
|
290
|
+
gaussian = channel + k * kNumChannels;
|
291
|
+
|
292
|
+
nmk = self->noise_means[gaussian];
|
293
|
+
smk = self->speech_means[gaussian];
|
294
|
+
nsk = self->noise_stds[gaussian];
|
295
|
+
ssk = self->speech_stds[gaussian];
|
296
|
+
|
297
|
+
// Update noise mean vector if the frame consists of noise only.
|
298
|
+
nmk2 = nmk;
|
299
|
+
if (!vadflag) {
|
300
|
+
// deltaN = (x-mu)/sigma^2
|
301
|
+
// ngprvec[k] = |noise_probability[k]| /
|
302
|
+
// (|noise_probability[0]| + |noise_probability[1]|)
|
303
|
+
|
304
|
+
// (Q14 * Q11 >> 11) = Q14.
|
305
|
+
delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
|
306
|
+
// Q7 + (Q14 * Q15 >> 22) = Q7.
|
307
|
+
nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
|
308
|
+
}
|
309
|
+
|
310
|
+
// Long term correction of the noise mean.
|
311
|
+
// Q8 - Q8 = Q8.
|
312
|
+
ndelt = (feature_minimum << 4) - tmp1_s16;
|
313
|
+
// Q7 + (Q8 * Q8) >> 9 = Q7.
|
314
|
+
nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
|
315
|
+
|
316
|
+
// Control that the noise mean does not drift to much.
|
317
|
+
tmp_s16 = (int16_t) ((k + 5) << 7);
|
318
|
+
if (nmk3 < tmp_s16) {
|
319
|
+
nmk3 = tmp_s16;
|
320
|
+
}
|
321
|
+
tmp_s16 = (int16_t) ((72 + k - channel) << 7);
|
322
|
+
if (nmk3 > tmp_s16) {
|
323
|
+
nmk3 = tmp_s16;
|
324
|
+
}
|
325
|
+
self->noise_means[gaussian] = nmk3;
|
326
|
+
|
327
|
+
if (vadflag) {
|
328
|
+
// Update speech mean vector:
|
329
|
+
// |deltaS| = (x-mu)/sigma^2
|
330
|
+
// sgprvec[k] = |speech_probability[k]| /
|
331
|
+
// (|speech_probability[0]| + |speech_probability[1]|)
|
332
|
+
|
333
|
+
// (Q14 * Q11) >> 11 = Q14.
|
334
|
+
delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
|
335
|
+
// Q14 * Q15 >> 21 = Q8.
|
336
|
+
tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
|
337
|
+
// Q7 + (Q8 >> 1) = Q7. With rounding.
|
338
|
+
smk2 = smk + ((tmp_s16 + 1) >> 1);
|
339
|
+
|
340
|
+
// Control that the speech mean does not drift to much.
|
341
|
+
maxmu = maxspe + 640;
|
342
|
+
if (smk2 < kMinimumMean[k]) {
|
343
|
+
smk2 = kMinimumMean[k];
|
344
|
+
}
|
345
|
+
if (smk2 > maxmu) {
|
346
|
+
smk2 = maxmu;
|
347
|
+
}
|
348
|
+
self->speech_means[gaussian] = smk2; // Q7.
|
349
|
+
|
350
|
+
// (Q7 >> 3) = Q4. With rounding.
|
351
|
+
tmp_s16 = ((smk + 4) >> 3);
|
352
|
+
|
353
|
+
tmp_s16 = features[channel] - tmp_s16; // Q4
|
354
|
+
// (Q11 * Q4 >> 3) = Q12.
|
355
|
+
tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
|
356
|
+
tmp2_s32 = tmp1_s32 - 4096;
|
357
|
+
tmp_s16 = sgprvec[gaussian] >> 2;
|
358
|
+
// (Q14 >> 2) * Q12 = Q24.
|
359
|
+
tmp1_s32 = tmp_s16 * tmp2_s32;
|
360
|
+
|
361
|
+
tmp2_s32 = tmp1_s32 >> 4; // Q20
|
362
|
+
|
363
|
+
// 0.1 * Q20 / Q7 = Q13.
|
364
|
+
if (tmp2_s32 > 0) {
|
365
|
+
tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
|
366
|
+
} else {
|
367
|
+
tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
|
368
|
+
tmp_s16 = -tmp_s16;
|
369
|
+
}
|
370
|
+
// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
|
371
|
+
// Note that division by 4 equals shift by 2, hence,
|
372
|
+
// (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
|
373
|
+
tmp_s16 += 128; // Rounding.
|
374
|
+
ssk += (tmp_s16 >> 8);
|
375
|
+
if (ssk < kMinStd) {
|
376
|
+
ssk = kMinStd;
|
377
|
+
}
|
378
|
+
self->speech_stds[gaussian] = ssk;
|
379
|
+
} else {
|
380
|
+
// Update GMM variance vectors.
|
381
|
+
// deltaN * (features[channel] - nmk) - 1
|
382
|
+
// Q4 - (Q7 >> 3) = Q4.
|
383
|
+
tmp_s16 = features[channel] - (nmk >> 3);
|
384
|
+
// (Q11 * Q4 >> 3) = Q12.
|
385
|
+
tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
|
386
|
+
tmp1_s32 -= 4096;
|
387
|
+
|
388
|
+
// (Q14 >> 2) * Q12 = Q24.
|
389
|
+
tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
|
390
|
+
tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
|
391
|
+
// Q20 * approx 0.001 (2^-10=0.0009766), hence,
|
392
|
+
// (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
|
393
|
+
tmp1_s32 = tmp2_s32 >> 14;
|
394
|
+
|
395
|
+
// Q20 / Q7 = Q13.
|
396
|
+
if (tmp1_s32 > 0) {
|
397
|
+
tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
|
398
|
+
} else {
|
399
|
+
tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
|
400
|
+
tmp_s16 = -tmp_s16;
|
401
|
+
}
|
402
|
+
tmp_s16 += 32; // Rounding
|
403
|
+
nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7.
|
404
|
+
if (nsk < kMinStd) {
|
405
|
+
nsk = kMinStd;
|
406
|
+
}
|
407
|
+
self->noise_stds[gaussian] = nsk;
|
408
|
+
}
|
409
|
+
}
|
410
|
+
|
411
|
+
// Separate models if they are too close.
|
412
|
+
// |noise_global_mean| in Q14 (= Q7 * Q7).
|
413
|
+
noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
|
414
|
+
&kNoiseDataWeights[channel]);
|
415
|
+
|
416
|
+
// |speech_global_mean| in Q14 (= Q7 * Q7).
|
417
|
+
speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
|
418
|
+
&kSpeechDataWeights[channel]);
|
419
|
+
|
420
|
+
// |diff| = "global" speech mean - "global" noise mean.
|
421
|
+
// (Q14 >> 9) - (Q14 >> 9) = Q5.
|
422
|
+
diff = (int16_t) (speech_global_mean >> 9) -
|
423
|
+
(int16_t) (noise_global_mean >> 9);
|
424
|
+
if (diff < kMinimumDifference[channel]) {
|
425
|
+
tmp_s16 = kMinimumDifference[channel] - diff;
|
426
|
+
|
427
|
+
// |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
|
428
|
+
// |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
|
429
|
+
tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
|
430
|
+
tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
|
431
|
+
|
432
|
+
// Move Gaussian means for speech model by |tmp1_s16| and update
|
433
|
+
// |speech_global_mean|. Note that |self->speech_means[channel]| is
|
434
|
+
// changed after the call.
|
435
|
+
speech_global_mean = WeightedAverage(&self->speech_means[channel],
|
436
|
+
tmp1_s16,
|
437
|
+
&kSpeechDataWeights[channel]);
|
438
|
+
|
439
|
+
// Move Gaussian means for noise model by -|tmp2_s16| and update
|
440
|
+
// |noise_global_mean|. Note that |self->noise_means[channel]| is
|
441
|
+
// changed after the call.
|
442
|
+
noise_global_mean = WeightedAverage(&self->noise_means[channel],
|
443
|
+
-tmp2_s16,
|
444
|
+
&kNoiseDataWeights[channel]);
|
445
|
+
}
|
446
|
+
|
447
|
+
// Control that the speech & noise means do not drift to much.
|
448
|
+
maxspe = kMaximumSpeech[channel];
|
449
|
+
tmp2_s16 = (int16_t) (speech_global_mean >> 7);
|
450
|
+
if (tmp2_s16 > maxspe) {
|
451
|
+
// Upper limit of speech model.
|
452
|
+
tmp2_s16 -= maxspe;
|
453
|
+
|
454
|
+
for (k = 0; k < kNumGaussians; k++) {
|
455
|
+
self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
|
456
|
+
}
|
457
|
+
}
|
458
|
+
|
459
|
+
tmp2_s16 = (int16_t) (noise_global_mean >> 7);
|
460
|
+
if (tmp2_s16 > kMaximumNoise[channel]) {
|
461
|
+
tmp2_s16 -= kMaximumNoise[channel];
|
462
|
+
|
463
|
+
for (k = 0; k < kNumGaussians; k++) {
|
464
|
+
self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
|
465
|
+
}
|
466
|
+
}
|
467
|
+
}
|
468
|
+
self->frame_counter++;
|
469
|
+
}
|
470
|
+
|
471
|
+
// Smooth with respect to transition hysteresis.
|
472
|
+
if (!vadflag) {
|
473
|
+
if (self->over_hang > 0) {
|
474
|
+
vadflag = 2 + self->over_hang;
|
475
|
+
self->over_hang--;
|
476
|
+
}
|
477
|
+
self->num_of_speech = 0;
|
478
|
+
} else {
|
479
|
+
self->num_of_speech++;
|
480
|
+
if (self->num_of_speech > kMaxSpeechFrames) {
|
481
|
+
self->num_of_speech = kMaxSpeechFrames;
|
482
|
+
self->over_hang = overhead2;
|
483
|
+
} else {
|
484
|
+
self->over_hang = overhead1;
|
485
|
+
}
|
486
|
+
}
|
487
|
+
return vadflag;
|
488
|
+
}
|
489
|
+
|
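The test that GmmProbability() implements above can be restated compactly. The notation below is ours, not WebRTC's: $w_c$ denotes kSpectrumWeight[c], the thresholds come from the per-mode tables, and the shift counts are those returned by WebRtcSpl_NormW32().

% Per channel c, with a two-Gaussian mixture under each hypothesis:
%   h_1(c) = sum_k w^{s}_{k,c} \, p^{s}_{k,c}(x_c)  (speech),
%   h_0(c) = sum_k w^{n}_{k,c} \, p^{n}_{k,c}(x_c)  (noise).
\[
  L_c \;=\; \log_2 \frac{h_1(c)}{h_0(c)} \;\approx\; \mathrm{shifts}(h_0) - \mathrm{shifts}(h_1)
\]
\[
  \text{speech} \iff \Bigl(\exists\, c :\; 4\,L_c > T_{\text{local}}\Bigr)
  \;\lor\; \sum_{c} w_c\, L_c \;\ge\; T_{\text{global}}
\]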
// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=|kDefaultMode|).
  if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
    return -1;
  }

  self->init_flag = kInitCheck;

  return 0;
}

// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
  int return_value = 0;

  switch (mode) {
    case 0:
      // Quality mode.
      memcpy(self->over_hang_max_1, kOverHangMax1Q,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2Q,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdQ,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdQ,
             sizeof(self->total));
      break;
    case 1:
      // Low bitrate mode.
      memcpy(self->over_hang_max_1, kOverHangMax1LBR,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2LBR,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdLBR,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdLBR,
             sizeof(self->total));
      break;
    case 2:
      // Aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1AGG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2AGG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdAGG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdAGG,
             sizeof(self->total));
      break;
    case 3:
      // Very aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1VAG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2VAG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdVAG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdVAG,
             sizeof(self->total));
      break;
    default:
      return_value = -1;
      break;
  }

  return return_value;
}

// Calculate VAD decision by first extracting feature values and then calculate
// probability for both speech and background noise.

int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  int vad;
  size_t i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // |tmp_mem| is a temporary memory used by resample function, length is
  // frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const size_t kFrameLen10ms48khz = 480;
  const size_t kFrameLen10ms8khz = 80;
  size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;

  for (i = 0; i < num_10ms_frames; i++) {
    WebRtcSpl_Resample48khzTo8khz(speech_frame,
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}

int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  size_t len;
  int vad;
  int16_t speechWB[480];  // Downsampled speech frame: 960 samples (30ms in SWB)
  int16_t speechNB[240];  // Downsampled speech frame: 480 samples (30ms in WB)


  // Downsample signal 32->16->8 before doing VAD
  WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
                         frame_length);
  len = frame_length / 2;

  WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
  len /= 2;

  // Do VAD on an 8 kHz signal
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  size_t len;
  int vad;
  int16_t speechNB[240];  // Downsampled speech frame: 480 samples (30ms in WB)

  // Wideband: Downsample signal before doing VAD
  WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                         frame_length);

  len = frame_length / 2;
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length)
{
  int16_t feature_vector[kNumChannels], total_power;

  // Get power in the bands
  total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                            feature_vector);

  // Make a VAD
  inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

  return inst->vad;
}
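Tying the pieces together: the header earlier documents 10, 20 and 30 ms frames at 8000, 16000 and 32000 Hz, and the WebRtcVad_CalcVad*() entry points above reduce every input to the 80/160/240-sample 8 kHz frames that GmmProbability() switches on. The sketch below is illustrative bookkeeping, not part of the gem; 48 kHz is listed only because vad_core.c ships a 48 kHz path, even though the header above documents 8/16/32 kHz for WebRtcVad_Process().

#include <stddef.h>
#include <stdio.h>

// Samples per frame for a given rate (Hz) and duration (ms): rate / 1000 * ms.
static size_t frame_samples(int rate_hz, int ms) {
  return (size_t)(rate_hz / 1000) * (size_t)ms;
}

int main(void) {
  const int rates[] = { 8000, 16000, 32000, 48000 };
  // Divisors applied by the WebRtcVad_CalcVad*() chain above to reach 8 kHz:
  // 8 kHz -> 1, 16 kHz -> 2, 32 kHz -> 2*2 = 4, 48 kHz -> 6.
  const int to_8khz_divisor[] = { 1, 2, 4, 6 };
  for (int r = 0; r < 4; r++) {
    for (int ms = 10; ms <= 30; ms += 10) {
      size_t n = frame_samples(rates[r], ms);
      printf("%5d Hz, %2d ms: %4zu samples -> %3zu samples at 8 kHz\n",
             rates[r], ms, n, n / (size_t)to_8khz_divisor[r]);
    }
  }
  return 0;
}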