webrtcvad 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/webrtcvad/extconf.rb +29 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
- data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
- data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
- data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
- data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
- data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
- data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
- data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
- data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
- data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
- data/ext/webrtcvad/webrtcvad.c +112 -0
- metadata +37 -3
@@ -0,0 +1,114 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/*
|
12
|
+
* This header file includes the descriptions of the core VAD calls.
|
13
|
+
*/
|
14
|
+
|
15
|
+
#ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
|
16
|
+
#define COMMON_AUDIO_VAD_VAD_CORE_H_
|
17
|
+
|
18
|
+
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
19
|
+
|
20
|
+
// Compile-time dimensions and thresholds shared by the VAD core.
enum {
  // Number of frequency bands (named channels).
  kNumChannels = 6,
  // Number of Gaussians per channel in the GMM.
  kNumGaussians = 2,
  // Size of the per-channel, per-Gaussian model tables.
  kTableSize = kNumChannels * kNumGaussians,
  // Minimum energy required to trigger audio signal.
  kMinEnergy = 10
};
|
24
|
+
|
25
|
+
typedef struct VadInstT_ {
|
26
|
+
int vad;
|
27
|
+
int32_t downsampling_filter_states[4];
|
28
|
+
WebRtcSpl_State48khzTo8khz state_48_to_8;
|
29
|
+
int16_t noise_means[kTableSize];
|
30
|
+
int16_t speech_means[kTableSize];
|
31
|
+
int16_t noise_stds[kTableSize];
|
32
|
+
int16_t speech_stds[kTableSize];
|
33
|
+
// TODO(bjornv): Change to |frame_count|.
|
34
|
+
int32_t frame_counter;
|
35
|
+
int16_t over_hang; // Over Hang
|
36
|
+
int16_t num_of_speech;
|
37
|
+
// TODO(bjornv): Change to |age_vector|.
|
38
|
+
int16_t index_vector[16 * kNumChannels];
|
39
|
+
int16_t low_value_vector[16 * kNumChannels];
|
40
|
+
// TODO(bjornv): Change to |median|.
|
41
|
+
int16_t mean_value[kNumChannels];
|
42
|
+
int16_t upper_state[5];
|
43
|
+
int16_t lower_state[5];
|
44
|
+
int16_t hp_filter_state[4];
|
45
|
+
int16_t over_hang_max_1[3];
|
46
|
+
int16_t over_hang_max_2[3];
|
47
|
+
int16_t individual[3];
|
48
|
+
int16_t total[3];
|
49
|
+
|
50
|
+
int init_flag;
|
51
|
+
} VadInstT;
|
52
|
+
|
53
|
+
// Initializes the core VAD component. The default aggressiveness mode is
|
54
|
+
// controlled by |kDefaultMode| in vad_core.c.
|
55
|
+
//
|
56
|
+
// - self [i/o] : Instance that should be initialized
|
57
|
+
//
|
58
|
+
// returns : 0 (OK), -1 (null pointer in or if the default mode can't be
|
59
|
+
// set)
|
60
|
+
int WebRtcVad_InitCore(VadInstT* self);
|
61
|
+
|
62
|
+
/****************************************************************************
|
63
|
+
* WebRtcVad_set_mode_core(...)
|
64
|
+
*
|
65
|
+
* This function changes the VAD settings
|
66
|
+
*
|
67
|
+
* Input:
|
68
|
+
* - inst : VAD instance
|
69
|
+
* - mode : Aggressiveness degree
|
70
|
+
* 0 (High quality) - 3 (Highly aggressive)
|
71
|
+
*
|
72
|
+
* Output:
|
73
|
+
* - inst : Changed instance
|
74
|
+
*
|
75
|
+
* Return value : 0 - Ok
|
76
|
+
* -1 - Error
|
77
|
+
*/
|
78
|
+
|
79
|
+
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
80
|
+
|
81
|
+
/****************************************************************************
|
82
|
+
* WebRtcVad_CalcVad48khz(...)
|
83
|
+
* WebRtcVad_CalcVad32khz(...)
|
84
|
+
* WebRtcVad_CalcVad16khz(...)
|
85
|
+
* WebRtcVad_CalcVad8khz(...)
|
86
|
+
*
|
87
|
+
* Calculate probability for active speech and make VAD decision.
|
88
|
+
*
|
89
|
+
* Input:
|
90
|
+
* - inst : Instance that should be initialized
|
91
|
+
* - speech_frame : Input speech frame
|
92
|
+
* - frame_length : Number of input samples
|
93
|
+
*
|
94
|
+
* Output:
|
95
|
+
* - inst : Updated filter states etc.
|
96
|
+
*
|
97
|
+
* Return value : VAD decision
|
98
|
+
* 0 - No active speech
|
99
|
+
* 1-6 - Active speech
|
100
|
+
*/
|
101
|
+
int WebRtcVad_CalcVad48khz(VadInstT* inst,
|
102
|
+
const int16_t* speech_frame,
|
103
|
+
size_t frame_length);
|
104
|
+
int WebRtcVad_CalcVad32khz(VadInstT* inst,
|
105
|
+
const int16_t* speech_frame,
|
106
|
+
size_t frame_length);
|
107
|
+
int WebRtcVad_CalcVad16khz(VadInstT* inst,
|
108
|
+
const int16_t* speech_frame,
|
109
|
+
size_t frame_length);
|
110
|
+
int WebRtcVad_CalcVad8khz(VadInstT* inst,
|
111
|
+
const int16_t* speech_frame,
|
112
|
+
size_t frame_length);
|
113
|
+
|
114
|
+
#endif // COMMON_AUDIO_VAD_VAD_CORE_H_
|
@@ -0,0 +1,329 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include "common_audio/vad/vad_filterbank.h"
|
12
|
+
|
13
|
+
#include "rtc_base/checks.h"
|
14
|
+
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
15
|
+
|
16
|
+
// LogOfEnergy() constants.
//   kLogConst         : 160*log10(2) in Q9.
//   kLogEnergyIntPart : 14 in Q10.
static const int16_t kLogConst = 24660;
static const int16_t kLogEnergyIntPart = 14336;

// HighPassFilter() coefficients in Q14: numerator (zeros) and denominator
// (poles).
static const int16_t kHpZeroCoefs[3] = {6631, -13262, 6631};
static const int16_t kHpPoleCoefs[3] = {16384, -7756, 5620};

// All-pass filter coefficients in Q15; element 0 is the upper branch (0.64),
// element 1 the lower branch (0.17).
static const int16_t kAllPassCoefsQ15[2] = {20972, 5571};

// Per-band adjustment for division with two in SplitFilter.
static const int16_t kOffsetVector[6] = {368, 368, 272, 176, 176, 176};
|
30
|
+
|
31
|
+
// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
|
32
|
+
// sampled at 500 Hz.
|
33
|
+
//
|
34
|
+
// - data_in [i] : Input audio data sampled at 500 Hz.
|
35
|
+
// - data_length [i] : Length of input and output data.
|
36
|
+
// - filter_state [i/o] : State of the filter.
|
37
|
+
// - data_out [o] : Output audio data in the frequency interval
|
38
|
+
// 80 - 250 Hz.
|
39
|
+
static void HighPassFilter(const int16_t* data_in, size_t data_length,
|
40
|
+
int16_t* filter_state, int16_t* data_out) {
|
41
|
+
size_t i;
|
42
|
+
const int16_t* in_ptr = data_in;
|
43
|
+
int16_t* out_ptr = data_out;
|
44
|
+
int32_t tmp32 = 0;
|
45
|
+
|
46
|
+
|
47
|
+
// The sum of the absolute values of the impulse response:
|
48
|
+
// The zero/pole-filter has a max amplification of a single sample of: 1.4546
|
49
|
+
// Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
|
50
|
+
// The all-zero section has a max amplification of a single sample of: 1.6189
|
51
|
+
// Impulse response: 0.4047 -0.8094 0.4047 0 0 0
|
52
|
+
// The all-pole section has a max amplification of a single sample of: 1.9931
|
53
|
+
// Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
|
54
|
+
|
55
|
+
for (i = 0; i < data_length; i++) {
|
56
|
+
// All-zero section (filter coefficients in Q14).
|
57
|
+
tmp32 = kHpZeroCoefs[0] * *in_ptr;
|
58
|
+
tmp32 += kHpZeroCoefs[1] * filter_state[0];
|
59
|
+
tmp32 += kHpZeroCoefs[2] * filter_state[1];
|
60
|
+
filter_state[1] = filter_state[0];
|
61
|
+
filter_state[0] = *in_ptr++;
|
62
|
+
|
63
|
+
// All-pole section (filter coefficients in Q14).
|
64
|
+
tmp32 -= kHpPoleCoefs[1] * filter_state[2];
|
65
|
+
tmp32 -= kHpPoleCoefs[2] * filter_state[3];
|
66
|
+
filter_state[3] = filter_state[2];
|
67
|
+
filter_state[2] = (int16_t) (tmp32 >> 14);
|
68
|
+
*out_ptr++ = filter_state[2];
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
// All pass filtering of every second sample of |data_in|, used before
// splitting the signal into two frequency bands (low pass vs high pass).
// Note that |data_in| and |data_out| can NOT correspond to the same address.
//
// - data_in            [i]   : Input audio signal given in Q0. Samples
//                              data_in[0], data_in[2], ...,
//                              data_in[2 * (data_length - 1)] are read.
// - data_length        [i]   : Number of samples to produce, i.e. the length
//                              of |data_out|.
// - filter_coefficient [i]   : Given in Q15.
// - filter_state       [i/o] : State of the filter given in Q(-1).
// - data_out           [o]   : Output audio signal given in Q(-1).
static void AllPassFilter(const int16_t* data_in, size_t data_length,
                          int16_t filter_coefficient, int16_t* filter_state,
                          int16_t* data_out) {
  // The filter can only cause overflow (in the w16 output variable)
  // if more than 4 consecutive input numbers are of maximum value and
  // have the same sign as the impulse response's first taps.
  // First 6 taps of the impulse response:
  // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990

  size_t k;
  // Multiplication rather than a left shift, since the state may be negative.
  int32_t state32 = ((int32_t) (*filter_state) * (1 << 16));  // Q15

  for (k = 0; k < data_length; k++) {
    // Index the input instead of stepping a pointer by two after every
    // sample: stepping would leave the pointer beyond one-past-the-end of the
    // input after the last iteration, which is undefined pointer arithmetic
    // even if the pointer is never dereferenced.
    const int16_t* sample = &data_in[2 * k];
    int32_t tmp32 = state32 + filter_coefficient * *sample;
    int16_t tmp16 = (int16_t) (tmp32 >> 16);  // Q(-1)

    data_out[k] = tmp16;
    state32 = (*sample * (1 << 14)) - filter_coefficient * tmp16;  // Q14
    state32 *= 2;  // Q15.
  }

  *filter_state = (int16_t) (state32 >> 16);  // Q(-1)
}
|
106
|
+
|
107
|
+
// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
|
108
|
+
// an upper (high pass) part and a lower (low pass) part respectively.
|
109
|
+
//
|
110
|
+
// - data_in [i] : Input audio data to be split into two frequency bands.
|
111
|
+
// - data_length [i] : Length of |data_in|.
|
112
|
+
// - upper_state [i/o] : State of the upper filter, given in Q(-1).
|
113
|
+
// - lower_state [i/o] : State of the lower filter, given in Q(-1).
|
114
|
+
// - hp_data_out [o] : Output audio data of the upper half of the spectrum.
|
115
|
+
// The length is |data_length| / 2.
|
116
|
+
// - lp_data_out [o] : Output audio data of the lower half of the spectrum.
|
117
|
+
// The length is |data_length| / 2.
|
118
|
+
static void SplitFilter(const int16_t* data_in, size_t data_length,
|
119
|
+
int16_t* upper_state, int16_t* lower_state,
|
120
|
+
int16_t* hp_data_out, int16_t* lp_data_out) {
|
121
|
+
size_t i;
|
122
|
+
size_t half_length = data_length >> 1; // Downsampling by 2.
|
123
|
+
int16_t tmp_out;
|
124
|
+
|
125
|
+
// All-pass filtering upper branch.
|
126
|
+
AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
|
127
|
+
hp_data_out);
|
128
|
+
|
129
|
+
// All-pass filtering lower branch.
|
130
|
+
AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
|
131
|
+
lp_data_out);
|
132
|
+
|
133
|
+
// Make LP and HP signals.
|
134
|
+
for (i = 0; i < half_length; i++) {
|
135
|
+
tmp_out = *hp_data_out;
|
136
|
+
*hp_data_out++ -= *lp_data_out;
|
137
|
+
*lp_data_out++ += tmp_out;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
// Calculates the energy of |data_in| in dB, and also updates an overall
|
142
|
+
// |total_energy| if necessary.
|
143
|
+
//
|
144
|
+
// - data_in [i] : Input audio data for energy calculation.
|
145
|
+
// - data_length [i] : Length of input data.
|
146
|
+
// - offset [i] : Offset value added to |log_energy|.
|
147
|
+
// - total_energy [i/o] : An external energy updated with the energy of
|
148
|
+
// |data_in|.
|
149
|
+
// NOTE: |total_energy| is only updated if
|
150
|
+
// |total_energy| <= |kMinEnergy|.
|
151
|
+
// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
|
152
|
+
static void LogOfEnergy(const int16_t* data_in, size_t data_length,
|
153
|
+
int16_t offset, int16_t* total_energy,
|
154
|
+
int16_t* log_energy) {
|
155
|
+
// |tot_rshifts| accumulates the number of right shifts performed on |energy|.
|
156
|
+
int tot_rshifts = 0;
|
157
|
+
// The |energy| will be normalized to 15 bits. We use unsigned integer because
|
158
|
+
// we eventually will mask out the fractional part.
|
159
|
+
uint32_t energy = 0;
|
160
|
+
|
161
|
+
RTC_DCHECK(data_in);
|
162
|
+
RTC_DCHECK_GT(data_length, 0);
|
163
|
+
|
164
|
+
energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
|
165
|
+
&tot_rshifts);
|
166
|
+
|
167
|
+
if (energy != 0) {
|
168
|
+
// By construction, normalizing to 15 bits is equivalent with 17 leading
|
169
|
+
// zeros of an unsigned 32 bit value.
|
170
|
+
int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
|
171
|
+
// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
|
172
|
+
// (14 << 10), which is what we initialize |log2_energy| with. For a more
|
173
|
+
// detailed derivations, see below.
|
174
|
+
int16_t log2_energy = kLogEnergyIntPart;
|
175
|
+
|
176
|
+
tot_rshifts += normalizing_rshifts;
|
177
|
+
// Normalize |energy| to 15 bits.
|
178
|
+
// |tot_rshifts| is now the total number of right shifts performed on
|
179
|
+
// |energy| after normalization. This means that |energy| is in
|
180
|
+
// Q(-tot_rshifts).
|
181
|
+
if (normalizing_rshifts < 0) {
|
182
|
+
energy <<= -normalizing_rshifts;
|
183
|
+
} else {
|
184
|
+
energy >>= normalizing_rshifts;
|
185
|
+
}
|
186
|
+
|
187
|
+
// Calculate the energy of |data_in| in dB, in Q4.
|
188
|
+
//
|
189
|
+
// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
|
190
|
+
// 160 * log10(|energy| * 2^|tot_rshifts|) =
|
191
|
+
// 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
|
192
|
+
// 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
|
193
|
+
// (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
|
194
|
+
// |kLogConst| * (|log2_energy| + |tot_rshifts|)
|
195
|
+
//
|
196
|
+
// We know by construction that |energy| is normalized to 15 bits. Hence,
|
197
|
+
// |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
|
198
|
+
// Further, we'd like |log2_energy| in Q10
|
199
|
+
// log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
|
200
|
+
// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
|
201
|
+
// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
|
202
|
+
// (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
|
203
|
+
// (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
|
204
|
+
//
|
205
|
+
// Note that frac_Q15 = (|energy| & 0x00003FFF)
|
206
|
+
|
207
|
+
// Calculate and add the fractional part to |log2_energy|.
|
208
|
+
log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
|
209
|
+
|
210
|
+
// |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
|
211
|
+
// Note that we in our derivation above have accounted for an output in Q4.
|
212
|
+
*log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
|
213
|
+
((tot_rshifts * kLogConst) >> 9));
|
214
|
+
|
215
|
+
if (*log_energy < 0) {
|
216
|
+
*log_energy = 0;
|
217
|
+
}
|
218
|
+
} else {
|
219
|
+
*log_energy = offset;
|
220
|
+
return;
|
221
|
+
}
|
222
|
+
|
223
|
+
*log_energy += offset;
|
224
|
+
|
225
|
+
// Update the approximate |total_energy| with the energy of |data_in|, if
|
226
|
+
// |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
|
227
|
+
// energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
|
228
|
+
if (*total_energy <= kMinEnergy) {
|
229
|
+
if (tot_rshifts >= 0) {
|
230
|
+
// We know by construction that the |energy| > |kMinEnergy| in Q0, so add
|
231
|
+
// an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
|
232
|
+
*total_energy += kMinEnergy + 1;
|
233
|
+
} else {
|
234
|
+
// By construction |energy| is represented by 15 bits, hence any number of
|
235
|
+
// right shifted |energy| will fit in an int16_t. In addition, adding the
|
236
|
+
// value to |total_energy| is wrap around safe as long as
|
237
|
+
// |kMinEnergy| < 8192.
|
238
|
+
*total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
|
243
|
+
// Runs the VAD filter bank on |data_in|: a tree of two-band splits followed
// by a high pass filter on the lowest band, writing the log energy of each of
// the six bands to |features| (index 0 = lowest band). Returns the
// approximate total energy accumulated by LogOfEnergy().
int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
                                    size_t data_length, int16_t* features) {
  // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
  // 10, 20 or 30 ms in 8 kHz. Hence at most 120 samples remain after the first
  // split and at most 60 after the second. The buffers are reused further
  // down the tree, where the data is shorter still.
  int16_t hp_120[120], lp_120[120];
  int16_t hp_60[60], lp_60[60];
  // Number of samples in a band after one, two, three and four splits.
  const size_t len_2 = data_length >> 1;  // Bandwidth 2000 Hz.
  const size_t len_4 = len_2 >> 1;        // Bandwidth 1000 Hz.
  const size_t len_8 = len_4 >> 1;        // Bandwidth 500 Hz.
  const size_t len_16 = len_8 >> 1;       // Bandwidth 250 Hz.
  int16_t total_energy = 0;

  RTC_DCHECK_LE(data_length, 240);
  RTC_DCHECK_LT(4, kNumChannels - 1);  // Checking maximum |frequency_band|.

  // Split 0: [0 - 4000] Hz -> hp_120 = [2000 - 4000] Hz, lp_120 = [0 - 2000] Hz.
  SplitFilter(data_in, data_length, &self->upper_state[0],
              &self->lower_state[0], hp_120, lp_120);

  // Split 1: [2000 - 4000] Hz -> hp_60 = [3000 - 4000] Hz,
  // lp_60 = [2000 - 3000] Hz.
  SplitFilter(hp_120, len_2, &self->upper_state[1], &self->lower_state[1],
              hp_60, lp_60);
  // Energy in 3000 Hz - 4000 Hz.
  LogOfEnergy(hp_60, len_4, kOffsetVector[5], &total_energy, &features[5]);
  // Energy in 2000 Hz - 3000 Hz.
  LogOfEnergy(lp_60, len_4, kOffsetVector[4], &total_energy, &features[4]);

  // Split 2: [0 - 2000] Hz -> hp_60 = [1000 - 2000] Hz, lp_60 = [0 - 1000] Hz.
  SplitFilter(lp_120, len_2, &self->upper_state[2], &self->lower_state[2],
              hp_60, lp_60);
  // Energy in 1000 Hz - 2000 Hz.
  LogOfEnergy(hp_60, len_4, kOffsetVector[3], &total_energy, &features[3]);

  // Split 3: [0 - 1000] Hz -> hp_120 = [500 - 1000] Hz, lp_120 = [0 - 500] Hz.
  SplitFilter(lp_60, len_4, &self->upper_state[3], &self->lower_state[3],
              hp_120, lp_120);
  // Energy in 500 Hz - 1000 Hz.
  LogOfEnergy(hp_120, len_8, kOffsetVector[2], &total_energy, &features[2]);

  // Split 4: [0 - 500] Hz -> hp_60 = [250 - 500] Hz, lp_60 = [0 - 250] Hz.
  SplitFilter(lp_120, len_8, &self->upper_state[4], &self->lower_state[4],
              hp_60, lp_60);
  // Energy in 250 Hz - 500 Hz.
  LogOfEnergy(hp_60, len_16, kOffsetVector[1], &total_energy, &features[1]);

  // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
  HighPassFilter(lp_60, len_16, self->hp_filter_state, hp_120);
  // Energy in 80 Hz - 250 Hz.
  LogOfEnergy(hp_120, len_16, kOffsetVector[0], &total_energy, &features[0]);

  return total_energy;
}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license
|
5
|
+
* that can be found in the LICENSE file in the root of the source
|
6
|
+
* tree. An additional intellectual property rights grant can be found
|
7
|
+
* in the file PATENTS. All contributing project authors may
|
8
|
+
* be found in the AUTHORS file in the root of the source tree.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/*
|
12
|
+
* This file includes feature calculating functionality used in vad_core.c.
|
13
|
+
*/
|
14
|
+
|
15
|
+
#ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|
16
|
+
#define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|
17
|
+
|
18
|
+
#include "common_audio/vad/vad_core.h"
|
19
|
+
|
20
|
+
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
|
21
|
+
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
|
22
|
+
// 80 Hz - 250 Hz
|
23
|
+
// 250 Hz - 500 Hz
|
24
|
+
// 500 Hz - 1000 Hz
|
25
|
+
// 1000 Hz - 2000 Hz
|
26
|
+
// 2000 Hz - 3000 Hz
|
27
|
+
// 3000 Hz - 4000 Hz
|
28
|
+
//
|
29
|
+
// The values are given in Q4 and written to |features|. Further, an approximate
|
30
|
+
// overall energy is returned. The return value is used in
|
31
|
+
// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
|
32
|
+
// the threshold |kMinEnergy|.
|
33
|
+
//
|
34
|
+
// - self [i/o] : State information of the VAD.
|
35
|
+
// - data_in [i] : Input audio data, for feature extraction.
|
36
|
+
// - data_length [i] : Audio data size, in number of samples.
|
37
|
+
// - features [o] : 10 * log10(energy in each frequency band), Q4.
|
38
|
+
// - returns : Total energy of the signal (NOTE! This value is not
|
39
|
+
// exact. It is only used in a comparison.)
|
40
|
+
int16_t WebRtcVad_CalculateFeatures(VadInstT* self,
|
41
|
+
const int16_t* data_in,
|
42
|
+
size_t data_length,
|
43
|
+
int16_t* features);
|
44
|
+
|
45
|
+
#endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
|