webrtcvad 0.1.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/webrtcvad/extconf.rb +29 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
- data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
- data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
- data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
- data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
- data/ext/webrtcvad/webrtcvad.c +112 -0
- metadata +37 -3
@@ -0,0 +1,114 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/*
|
12
|
+
* This header file includes the descriptions of the core VAD calls.
|
13
|
+
*/
|
14
|
+
|
15
|
+
#ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
|
16
|
+
#define COMMON_AUDIO_VAD_VAD_CORE_H_
|
17
|
+
|
18
|
+
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
19
|
+
|
20
|
+
// Sizing constants for the VAD state and its feature vector.
enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
enum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM.
// Length of each per-instance GMM table (|noise_means|, |speech_means|,
// |noise_stds|, |speech_stds|): |kNumChannels| * |kNumGaussians| entries.
enum { kTableSize = kNumChannels * kNumGaussians };
enum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal.
|
24
|
+
|
25
|
+
// Per-instance state of the core VAD.
//
// |upper_state|, |lower_state| and |hp_filter_state| are the filter memories
// read and updated by WebRtcVad_CalculateFeatures() in vad_filterbank.c. The
// remaining members are not used by the filter bank; notes on them below are
// marked as assumptions where their use is not visible from the filter bank.
typedef struct VadInstT_ {
  int vad;
  int32_t downsampling_filter_states[4];
  // NOTE(review): presumably the resampler state used for 48 kHz input -
  // confirm against vad_core.c.
  WebRtcSpl_State48khzTo8khz state_48_to_8;
  // Four tables of |kTableSize| (= |kNumChannels| * |kNumGaussians|) entries.
  // NOTE(review): names suggest GMM means and standard deviations for the
  // noise and speech models - confirm layout and Q-format in vad_core.c.
  int16_t noise_means[kTableSize];
  int16_t speech_means[kTableSize];
  int16_t noise_stds[kTableSize];
  int16_t speech_stds[kTableSize];
  // TODO(bjornv): Change to |frame_count|.
  int32_t frame_counter;
  int16_t over_hang; // Over Hang
  int16_t num_of_speech;
  // TODO(bjornv): Change to |age_vector|.
  int16_t index_vector[16 * kNumChannels];
  int16_t low_value_vector[16 * kNumChannels];
  // TODO(bjornv): Change to |median|.
  int16_t mean_value[kNumChannels];
  // All-pass filter states, given in Q(-1), for the upper and lower branch of
  // each SplitFilter() stage. WebRtcVad_CalculateFeatures() uses one entry per
  // stage, indices 0 through 4.
  int16_t upper_state[5];
  int16_t lower_state[5];
  // State of HighPassFilter(): entries 0 and 1 hold the two most recent input
  // samples, entries 2 and 3 the two most recent output samples.
  int16_t hp_filter_state[4];
  int16_t over_hang_max_1[3];
  int16_t over_hang_max_2[3];
  int16_t individual[3];
  int16_t total[3];

  // NOTE(review): presumably marks the instance as initialized - confirm
  // against WebRtcVad_InitCore().
  int init_flag;
} VadInstT;
|
52
|
+
|
53
|
+
// Initializes the core VAD component. The default aggressiveness mode is
|
54
|
+
// controlled by |kDefaultMode| in vad_core.c.
|
55
|
+
//
|
56
|
+
// - self [i/o] : Instance that should be initialized
|
57
|
+
//
|
58
|
+
// returns : 0 (OK), -1 (null pointer passed in, or if the default mode can't be
|
59
|
+
// set)
|
60
|
+
int WebRtcVad_InitCore(VadInstT* self);
|
61
|
+
|
62
|
+
/****************************************************************************
|
63
|
+
* WebRtcVad_set_mode_core(...)
|
64
|
+
*
|
65
|
+
* This function changes the VAD settings
|
66
|
+
*
|
67
|
+
* Input:
|
68
|
+
* - self : VAD instance
|
69
|
+
* - mode : Aggressiveness degree
|
70
|
+
* 0 (High quality) - 3 (Highly aggressive)
|
71
|
+
*
|
72
|
+
* Output:
|
73
|
+
* - self : Changed instance
|
74
|
+
*
|
75
|
+
* Return value : 0 - Ok
|
76
|
+
* -1 - Error
|
77
|
+
*/
|
78
|
+
|
79
|
+
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
80
|
+
|
81
|
+
/****************************************************************************
|
82
|
+
* WebRtcVad_CalcVad48khz(...)
|
83
|
+
* WebRtcVad_CalcVad32khz(...)
|
84
|
+
* WebRtcVad_CalcVad16khz(...)
|
85
|
+
* WebRtcVad_CalcVad8khz(...)
|
86
|
+
*
|
87
|
+
* Calculate probability for active speech and make VAD decision.
|
88
|
+
*
|
89
|
+
* Input:
|
90
|
+
* - inst : VAD instance whose filter states etc. are used
|
91
|
+
* - speech_frame : Input speech frame
|
92
|
+
* - frame_length : Number of input samples
|
93
|
+
*
|
94
|
+
* Output:
|
95
|
+
* - inst : Updated filter states etc.
|
96
|
+
*
|
97
|
+
* Return value : VAD decision
|
98
|
+
* 0 - No active speech
|
99
|
+
* 1-6 - Active speech
|
100
|
+
*/
|
101
|
+
int WebRtcVad_CalcVad48khz(VadInstT* inst,
|
102
|
+
const int16_t* speech_frame,
|
103
|
+
size_t frame_length);
|
104
|
+
int WebRtcVad_CalcVad32khz(VadInstT* inst,
|
105
|
+
const int16_t* speech_frame,
|
106
|
+
size_t frame_length);
|
107
|
+
int WebRtcVad_CalcVad16khz(VadInstT* inst,
|
108
|
+
const int16_t* speech_frame,
|
109
|
+
size_t frame_length);
|
110
|
+
int WebRtcVad_CalcVad8khz(VadInstT* inst,
|
111
|
+
const int16_t* speech_frame,
|
112
|
+
size_t frame_length);
|
113
|
+
|
114
|
+
#endif // COMMON_AUDIO_VAD_VAD_CORE_H_
|
@@ -0,0 +1,329 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include "common_audio/vad/vad_filterbank.h"
|
12
|
+
|
13
|
+
#include "rtc_base/checks.h"
|
14
|
+
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
15
|
+
|
16
|
+
// Constants used in LogOfEnergy().
static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10, i.e. (14 << 10).

// Coefficients used by HighPassFilter, Q14.
// |kHpZeroCoefs| are the taps of the all-zero section and |kHpPoleCoefs| the
// taps of the all-pole section. |kHpPoleCoefs[0]| (16384, i.e. 1.0 in Q14) is
// not read by HighPassFilter(); only entries 1 and 2 are applied.
static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };

// Allpass filter coefficients, upper and lower, in Q15.
// Upper: 0.64, Lower: 0.17
// SplitFilter() applies entry 0 to the upper branch and entry 1 to the lower
// branch.
static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };

// Adjustment for division with two in SplitFilter.
// Indexed like |features|: WebRtcVad_CalculateFeatures() passes
// |kOffsetVector[k]| as the offset that LogOfEnergy() adds to |features[k]|.
static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
|
30
|
+
|
31
|
+
// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
|
32
|
+
// sampled at 500 Hz.
|
33
|
+
//
|
34
|
+
// - data_in [i] : Input audio data sampled at 500 Hz.
|
35
|
+
// - data_length [i] : Length of input and output data.
|
36
|
+
// - filter_state [i/o] : State of the filter.
|
37
|
+
// - data_out [o] : Output audio data in the frequency interval
|
38
|
+
// 80 - 250 Hz.
|
39
|
+
static void HighPassFilter(const int16_t* data_in, size_t data_length,
|
40
|
+
int16_t* filter_state, int16_t* data_out) {
|
41
|
+
size_t i;
|
42
|
+
const int16_t* in_ptr = data_in;
|
43
|
+
int16_t* out_ptr = data_out;
|
44
|
+
int32_t tmp32 = 0;
|
45
|
+
|
46
|
+
|
47
|
+
// The sum of the absolute values of the impulse response:
|
48
|
+
// The zero/pole-filter has a max amplification of a single sample of: 1.4546
|
49
|
+
// Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
|
50
|
+
// The all-zero section has a max amplification of a single sample of: 1.6189
|
51
|
+
// Impulse response: 0.4047 -0.8094 0.4047 0 0 0
|
52
|
+
// The all-pole section has a max amplification of a single sample of: 1.9931
|
53
|
+
// Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
|
54
|
+
|
55
|
+
for (i = 0; i < data_length; i++) {
|
56
|
+
// All-zero section (filter coefficients in Q14).
|
57
|
+
tmp32 = kHpZeroCoefs[0] * *in_ptr;
|
58
|
+
tmp32 += kHpZeroCoefs[1] * filter_state[0];
|
59
|
+
tmp32 += kHpZeroCoefs[2] * filter_state[1];
|
60
|
+
filter_state[1] = filter_state[0];
|
61
|
+
filter_state[0] = *in_ptr++;
|
62
|
+
|
63
|
+
// All-pole section (filter coefficients in Q14).
|
64
|
+
tmp32 -= kHpPoleCoefs[1] * filter_state[2];
|
65
|
+
tmp32 -= kHpPoleCoefs[2] * filter_state[3];
|
66
|
+
filter_state[3] = filter_state[2];
|
67
|
+
filter_state[2] = (int16_t) (tmp32 >> 14);
|
68
|
+
*out_ptr++ = filter_state[2];
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
// First order all pass filter, applied to every second sample of |data_in|.
// It is used before splitting the signal into two frequency bands (low pass
// vs high pass).
// Note that |data_in| and |data_out| can NOT correspond to the same address.
//
// - data_in            [i]   : Input audio signal given in Q0. Samples are
//                              read with a stride of two.
// - data_length        [i]   : Number of output samples to produce.
// - filter_coefficient [i]   : Given in Q15.
// - filter_state       [i/o] : State of the filter given in Q(-1).
// - data_out           [o]   : Output audio signal given in Q(-1).
static void AllPassFilter(const int16_t* data_in, size_t data_length,
                          int16_t filter_coefficient, int16_t* filter_state,
                          int16_t* data_out) {
  // The filter can only cause overflow (in the 16 bit output) if more than 4
  // consecutive input numbers are of maximum value and have the same sign as
  // the first taps of the impulse response.
  // First 6 taps of the impulse response:
  // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990

  size_t k;
  // Running state, widened from Q(-1) to Q15.
  int32_t state_q15 = ((int32_t) (*filter_state)) * (1 << 16);

  for (k = 0; k < data_length; k++) {
    // Output sample in Q(-1).
    const int16_t filtered =
        (int16_t) ((state_q15 + filter_coefficient * data_in[2 * k]) >> 16);
    data_out[k] = filtered;

    // The difference is formed in Q14 and then doubled to reach Q15.
    state_q15 = 2 * ((data_in[2 * k] * (1 << 14)) -
                     filter_coefficient * filtered);
  }

  *filter_state = (int16_t) (state_q15 >> 16);  // Q(-1)
}
|
106
|
+
|
107
|
+
// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
|
108
|
+
// an upper (high pass) part and a lower (low pass) part respectively.
|
109
|
+
//
|
110
|
+
// - data_in [i] : Input audio data to be split into two frequency bands.
|
111
|
+
// - data_length [i] : Length of |data_in|.
|
112
|
+
// - upper_state [i/o] : State of the upper filter, given in Q(-1).
|
113
|
+
// - lower_state [i/o] : State of the lower filter, given in Q(-1).
|
114
|
+
// - hp_data_out [o] : Output audio data of the upper half of the spectrum.
|
115
|
+
// The length is |data_length| / 2.
|
116
|
+
// - lp_data_out [o] : Output audio data of the lower half of the spectrum.
|
117
|
+
// The length is |data_length| / 2.
|
118
|
+
static void SplitFilter(const int16_t* data_in, size_t data_length,
|
119
|
+
int16_t* upper_state, int16_t* lower_state,
|
120
|
+
int16_t* hp_data_out, int16_t* lp_data_out) {
|
121
|
+
size_t i;
|
122
|
+
size_t half_length = data_length >> 1; // Downsampling by 2.
|
123
|
+
int16_t tmp_out;
|
124
|
+
|
125
|
+
// All-pass filtering upper branch.
|
126
|
+
AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
|
127
|
+
hp_data_out);
|
128
|
+
|
129
|
+
// All-pass filtering lower branch.
|
130
|
+
AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
|
131
|
+
lp_data_out);
|
132
|
+
|
133
|
+
// Make LP and HP signals.
|
134
|
+
for (i = 0; i < half_length; i++) {
|
135
|
+
tmp_out = *hp_data_out;
|
136
|
+
*hp_data_out++ -= *lp_data_out;
|
137
|
+
*lp_data_out++ += tmp_out;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
// Calculates the energy of |data_in| in dB, and also updates an overall
|
142
|
+
// |total_energy| if necessary.
|
143
|
+
//
|
144
|
+
// - data_in [i] : Input audio data for energy calculation.
|
145
|
+
// - data_length [i] : Length of input data.
|
146
|
+
// - offset [i] : Offset value added to |log_energy|.
|
147
|
+
// - total_energy [i/o] : An external energy updated with the energy of
|
148
|
+
// |data_in|.
|
149
|
+
// NOTE: |total_energy| is only updated if
|
150
|
+
// |total_energy| <= |kMinEnergy|.
|
151
|
+
// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
|
152
|
+
static void LogOfEnergy(const int16_t* data_in, size_t data_length,
|
153
|
+
int16_t offset, int16_t* total_energy,
|
154
|
+
int16_t* log_energy) {
|
155
|
+
// |tot_rshifts| accumulates the number of right shifts performed on |energy|.
|
156
|
+
int tot_rshifts = 0;
|
157
|
+
// The |energy| will be normalized to 15 bits. We use unsigned integer because
|
158
|
+
// we eventually will mask out the fractional part.
|
159
|
+
uint32_t energy = 0;
|
160
|
+
|
161
|
+
RTC_DCHECK(data_in);
|
162
|
+
RTC_DCHECK_GT(data_length, 0);
|
163
|
+
|
164
|
+
energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
|
165
|
+
&tot_rshifts);
|
166
|
+
|
167
|
+
if (energy != 0) {
|
168
|
+
// By construction, normalizing to 15 bits is equivalent with 17 leading
|
169
|
+
// zeros of an unsigned 32 bit value.
|
170
|
+
int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
|
171
|
+
// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
|
172
|
+
// (14 << 10), which is what we initialize |log2_energy| with. For a more
|
173
|
+
// detailed derivations, see below.
|
174
|
+
int16_t log2_energy = kLogEnergyIntPart;
|
175
|
+
|
176
|
+
tot_rshifts += normalizing_rshifts;
|
177
|
+
// Normalize |energy| to 15 bits.
|
178
|
+
// |tot_rshifts| is now the total number of right shifts performed on
|
179
|
+
// |energy| after normalization. This means that |energy| is in
|
180
|
+
// Q(-tot_rshifts).
|
181
|
+
if (normalizing_rshifts < 0) {
|
182
|
+
energy <<= -normalizing_rshifts;
|
183
|
+
} else {
|
184
|
+
energy >>= normalizing_rshifts;
|
185
|
+
}
|
186
|
+
|
187
|
+
// Calculate the energy of |data_in| in dB, in Q4.
|
188
|
+
//
|
189
|
+
// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
|
190
|
+
// 160 * log10(|energy| * 2^|tot_rshifts|) =
|
191
|
+
// 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
|
192
|
+
// 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
|
193
|
+
// (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
|
194
|
+
// |kLogConst| * (|log2_energy| + |tot_rshifts|)
|
195
|
+
//
|
196
|
+
// We know by construction that |energy| is normalized to 15 bits. Hence,
|
197
|
+
// |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
|
198
|
+
// Further, we'd like |log2_energy| in Q10
|
199
|
+
// log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
|
200
|
+
// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
|
201
|
+
// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
|
202
|
+
// (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
|
203
|
+
// (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
|
204
|
+
//
|
205
|
+
// Note that frac_Q15 = (|energy| & 0x00003FFF)
|
206
|
+
|
207
|
+
// Calculate and add the fractional part to |log2_energy|.
|
208
|
+
log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
|
209
|
+
|
210
|
+
// |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
|
211
|
+
// Note that we in our derivation above have accounted for an output in Q4.
|
212
|
+
*log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
|
213
|
+
((tot_rshifts * kLogConst) >> 9));
|
214
|
+
|
215
|
+
if (*log_energy < 0) {
|
216
|
+
*log_energy = 0;
|
217
|
+
}
|
218
|
+
} else {
|
219
|
+
*log_energy = offset;
|
220
|
+
return;
|
221
|
+
}
|
222
|
+
|
223
|
+
*log_energy += offset;
|
224
|
+
|
225
|
+
// Update the approximate |total_energy| with the energy of |data_in|, if
|
226
|
+
// |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
|
227
|
+
// energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
|
228
|
+
if (*total_energy <= kMinEnergy) {
|
229
|
+
if (tot_rshifts >= 0) {
|
230
|
+
// We know by construction that the |energy| > |kMinEnergy| in Q0, so add
|
231
|
+
// an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
|
232
|
+
*total_energy += kMinEnergy + 1;
|
233
|
+
} else {
|
234
|
+
// By construction |energy| is represented by 15 bits, hence any number of
|
235
|
+
// right shifted |energy| will fit in an int16_t. In addition, adding the
|
236
|
+
// value to |total_energy| is wrap around safe as long as
|
237
|
+
// |kMinEnergy| < 8192.
|
238
|
+
*total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
|
243
|
+
// Runs the five stage split filter bank on |data_in| and writes the log
// energy of each of the six frequency bands to |features[0..5]|, lowest band
// first. The filter states in |self| (|upper_state|, |lower_state| and
// |hp_filter_state|) are updated. Returns the approximate total energy
// accumulated by LogOfEnergy().
int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
                                    size_t data_length, int16_t* features) {
  int16_t total_energy = 0;
  // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
  // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
  // have at most 120 samples after the first split and at most 60 samples after
  // the second split. The two buffer pairs are reused alternately by the
  // successive stages.
  int16_t hi_wide[120], lo_wide[120];
  int16_t hi_narrow[60], lo_narrow[60];
  // Number of samples per band after each level of downsampling by two.
  const size_t len_div2 = data_length >> 1;   // bandwidth = 2000 Hz.
  const size_t len_div4 = len_div2 >> 1;      // bandwidth = 1000 Hz.
  const size_t len_div8 = len_div4 >> 1;      // bandwidth = 500 Hz.
  const size_t len_div16 = len_div8 >> 1;     // bandwidth = 250 Hz.

  RTC_DCHECK_LE(data_length, 240);
  RTC_DCHECK_LT(4, kNumChannels - 1);  // Checking maximum |frequency_band|.

  // Stage 0: split [0 - 4000] Hz at 2000 Hz and downsample.
  //   hi_wide <- [2000 - 4000] Hz, lo_wide <- [0 - 2000] Hz.
  SplitFilter(data_in, data_length, &self->upper_state[0],
              &self->lower_state[0], hi_wide, lo_wide);

  // Stage 1: split the upper band (2000 Hz - 4000 Hz) at 3000 Hz and
  // downsample.
  //   hi_narrow <- [3000 - 4000] Hz, lo_narrow <- [2000 - 3000] Hz.
  SplitFilter(hi_wide, len_div2, &self->upper_state[1],
              &self->lower_state[1], hi_narrow, lo_narrow);

  // Energy in 3000 Hz - 4000 Hz.
  LogOfEnergy(hi_narrow, len_div4, kOffsetVector[5], &total_energy,
              &features[5]);

  // Energy in 2000 Hz - 3000 Hz.
  LogOfEnergy(lo_narrow, len_div4, kOffsetVector[4], &total_energy,
              &features[4]);

  // Stage 2: split the lower band (0 Hz - 2000 Hz) at 1000 Hz and downsample.
  //   hi_narrow <- [1000 - 2000] Hz, lo_narrow <- [0 - 1000] Hz.
  SplitFilter(lo_wide, len_div2, &self->upper_state[2],
              &self->lower_state[2], hi_narrow, lo_narrow);

  // Energy in 1000 Hz - 2000 Hz.
  LogOfEnergy(hi_narrow, len_div4, kOffsetVector[3], &total_energy,
              &features[3]);

  // Stage 3: split the lower band (0 Hz - 1000 Hz) at 500 Hz and downsample.
  //   hi_wide <- [500 - 1000] Hz, lo_wide <- [0 - 500] Hz.
  SplitFilter(lo_narrow, len_div4, &self->upper_state[3],
              &self->lower_state[3], hi_wide, lo_wide);

  // Energy in 500 Hz - 1000 Hz.
  LogOfEnergy(hi_wide, len_div8, kOffsetVector[2], &total_energy,
              &features[2]);

  // Stage 4: split the lower band (0 Hz - 500 Hz) at 250 Hz and downsample.
  //   hi_narrow <- [250 - 500] Hz, lo_narrow <- [0 - 250] Hz.
  SplitFilter(lo_wide, len_div8, &self->upper_state[4],
              &self->lower_state[4], hi_narrow, lo_narrow);

  // Energy in 250 Hz - 500 Hz.
  LogOfEnergy(hi_narrow, len_div16, kOffsetVector[1], &total_energy,
              &features[1]);

  // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
  HighPassFilter(lo_narrow, len_div16, self->hp_filter_state, hi_wide);

  // Energy in 80 Hz - 250 Hz.
  LogOfEnergy(hi_wide, len_div16, kOffsetVector[0], &total_energy,
              &features[0]);

  return total_energy;
}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/*
|
12
|
+
* This file includes feature calculating functionality used in vad_core.c.
|
13
|
+
*/
|
14
|
+
|
15
|
+
#ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|
16
|
+
#define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|
17
|
+
|
18
|
+
#include "common_audio/vad/vad_core.h"
|
19
|
+
|
20
|
+
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
|
21
|
+
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
|
22
|
+
// 80 Hz - 250 Hz
|
23
|
+
// 250 Hz - 500 Hz
|
24
|
+
// 500 Hz - 1000 Hz
|
25
|
+
// 1000 Hz - 2000 Hz
|
26
|
+
// 2000 Hz - 3000 Hz
|
27
|
+
// 3000 Hz - 4000 Hz
|
28
|
+
//
|
29
|
+
// The values are given in Q4 and written to |features|. Further, an approximate
|
30
|
+
// overall energy is returned. The return value is used in
|
31
|
+
// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
|
32
|
+
// the threshold |kMinEnergy|.
|
33
|
+
//
|
34
|
+
// - self [i/o] : State information of the VAD.
|
35
|
+
// - data_in [i] : Input audio data, for feature extraction.
|
36
|
+
// - data_length [i] : Audio data size, in number of samples.
|
37
|
+
// - features [o] : 10 * log10(energy in each frequency band), Q4.
|
38
|
+
// - returns : Total energy of the signal (NOTE! This value is not
|
39
|
+
// exact. It is only used in a comparison.)
|
40
|
+
int16_t WebRtcVad_CalculateFeatures(VadInstT* self,
|
41
|
+
const int16_t* data_in,
|
42
|
+
size_t data_length,
|
43
|
+
int16_t* features);
|
44
|
+
|
45
|
+
#endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|