webrtcvad 0.1.0 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/ext/webrtcvad/extconf.rb +29 -0
  3. data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
  4. data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
  5. data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
  6. data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
  7. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
  8. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
  9. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
  10. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
  11. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
  12. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
  13. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
  14. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
  15. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
  16. data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
  17. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
  18. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
  19. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
  20. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
  21. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
  22. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
  23. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
  24. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
  25. data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
  26. data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
  27. data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
  28. data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
  29. data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
  30. data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
  31. data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
  32. data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
  33. data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
  34. data/ext/webrtcvad/webrtcvad.c +112 -0
  35. metadata +37 -3
@@ -0,0 +1,114 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ /*
12
+ * This header file includes the descriptions of the core VAD calls.
13
+ */
14
+
15
+ #ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
16
+ #define COMMON_AUDIO_VAD_VAD_CORE_H_
17
+
18
+ #include "common_audio/signal_processing/include/signal_processing_library.h"
19
+
20
// Compile-time sizes and thresholds shared by the VAD sources.
enum {
  // Number of frequency bands (named channels).
  kNumChannels = 6,
  // Number of Gaussians per channel in the GMM.
  kNumGaussians = 2,
  // One table entry per (channel, Gaussian) pair.
  kTableSize = kNumChannels * kNumGaussians,
  // Minimum energy required to trigger audio signal.
  kMinEnergy = 10
};
24
+
25
+ typedef struct VadInstT_ {
26
+ int vad;
27
+ int32_t downsampling_filter_states[4];
28
+ WebRtcSpl_State48khzTo8khz state_48_to_8;
29
+ int16_t noise_means[kTableSize];
30
+ int16_t speech_means[kTableSize];
31
+ int16_t noise_stds[kTableSize];
32
+ int16_t speech_stds[kTableSize];
33
+ // TODO(bjornv): Change to |frame_count|.
34
+ int32_t frame_counter;
35
+ int16_t over_hang; // Over Hang
36
+ int16_t num_of_speech;
37
+ // TODO(bjornv): Change to |age_vector|.
38
+ int16_t index_vector[16 * kNumChannels];
39
+ int16_t low_value_vector[16 * kNumChannels];
40
+ // TODO(bjornv): Change to |median|.
41
+ int16_t mean_value[kNumChannels];
42
+ int16_t upper_state[5];
43
+ int16_t lower_state[5];
44
+ int16_t hp_filter_state[4];
45
+ int16_t over_hang_max_1[3];
46
+ int16_t over_hang_max_2[3];
47
+ int16_t individual[3];
48
+ int16_t total[3];
49
+
50
+ int init_flag;
51
+ } VadInstT;
52
+
53
+ // Initializes the core VAD component. The default aggressiveness mode is
54
+ // controlled by |kDefaultMode| in vad_core.c.
55
+ //
56
+ // - self [i/o] : Instance that should be initialized
57
+ //
58
+ // returns : 0 (OK), -1 (|self| is a null pointer or the default mode
59
+ // can't be set)
60
+ int WebRtcVad_InitCore(VadInstT* self);
61
+
62
+ /****************************************************************************
63
+ * WebRtcVad_set_mode_core(...)
64
+ *
65
+ * This function changes the VAD settings
66
+ *
67
+ * Input:
68
+ * - self : VAD instance
69
+ * - mode : Aggressiveness degree
70
+ * 0 (High quality) - 3 (Highly aggressive)
71
+ *
72
+ * Output:
73
+ * - self : Changed instance
74
+ *
75
+ * Return value : 0 - Ok
76
+ * -1 - Error
77
+ */
78
+
79
+ int WebRtcVad_set_mode_core(VadInstT* self, int mode);
80
+
81
+ /****************************************************************************
82
+ * WebRtcVad_CalcVad48khz(...)
83
+ * WebRtcVad_CalcVad32khz(...)
84
+ * WebRtcVad_CalcVad16khz(...)
85
+ * WebRtcVad_CalcVad8khz(...)
86
+ *
87
+ * Calculate probability for active speech and make VAD decision.
88
+ *
89
+ * Input:
90
+ * - inst : VAD instance
91
+ * - speech_frame : Input speech frame
92
+ * - frame_length : Number of input samples
93
+ *
94
+ * Output:
95
+ * - inst : Updated filter states etc.
96
+ *
97
+ * Return value : VAD decision
98
+ * 0 - No active speech
99
+ * 1-6 - Active speech
100
+ */
101
+ int WebRtcVad_CalcVad48khz(VadInstT* inst,
102
+ const int16_t* speech_frame,
103
+ size_t frame_length);
104
+ int WebRtcVad_CalcVad32khz(VadInstT* inst,
105
+ const int16_t* speech_frame,
106
+ size_t frame_length);
107
+ int WebRtcVad_CalcVad16khz(VadInstT* inst,
108
+ const int16_t* speech_frame,
109
+ size_t frame_length);
110
+ int WebRtcVad_CalcVad8khz(VadInstT* inst,
111
+ const int16_t* speech_frame,
112
+ size_t frame_length);
113
+
114
+ #endif // COMMON_AUDIO_VAD_VAD_CORE_H_
@@ -0,0 +1,329 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ #include "common_audio/vad/vad_filterbank.h"
12
+
13
+ #include "rtc_base/checks.h"
14
+ #include "common_audio/signal_processing/include/signal_processing_library.h"
15
+
16
// Fixed-point constants for LogOfEnergy().
static const int16_t kLogConst = 24660;          // 160 * log10(2), in Q9.
static const int16_t kLogEnergyIntPart = 14336;  // 14 in Q10, i.e. (14 << 10).

// HighPassFilter() coefficients, in Q14.
static const int16_t kHpZeroCoefs[3] = {6631, -13262, 6631};
static const int16_t kHpPoleCoefs[3] = {16384, -7756, 5620};

// All-pass coefficients in Q15 for the two SplitFilter() branches:
// [0] upper branch (0.64), [1] lower branch (0.17).
static const int16_t kAllPassCoefsQ15[2] = {20972, 5571};

// Per-band offsets, indexed by feature/band number, that adjust for the
// division with two in SplitFilter().
static const int16_t kOffsetVector[6] = {368, 368, 272, 176, 176, 176};
30
+
31
+ // High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
32
+ // sampled at 500 Hz.
33
+ //
34
+ // - data_in [i] : Input audio data sampled at 500 Hz.
35
+ // - data_length [i] : Length of input and output data.
36
+ // - filter_state [i/o] : State of the filter.
37
+ // - data_out [o] : Output audio data in the frequency interval
38
+ // 80 - 250 Hz.
39
+ static void HighPassFilter(const int16_t* data_in, size_t data_length,
40
+ int16_t* filter_state, int16_t* data_out) {
41
+ size_t i;
42
+ const int16_t* in_ptr = data_in;
43
+ int16_t* out_ptr = data_out;
44
+ int32_t tmp32 = 0;
45
+
46
+
47
+ // The sum of the absolute values of the impulse response:
48
+ // The zero/pole-filter has a max amplification of a single sample of: 1.4546
49
+ // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
50
+ // The all-zero section has a max amplification of a single sample of: 1.6189
51
+ // Impulse response: 0.4047 -0.8094 0.4047 0 0 0
52
+ // The all-pole section has a max amplification of a single sample of: 1.9931
53
+ // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
54
+
55
+ for (i = 0; i < data_length; i++) {
56
+ // All-zero section (filter coefficients in Q14).
57
+ tmp32 = kHpZeroCoefs[0] * *in_ptr;
58
+ tmp32 += kHpZeroCoefs[1] * filter_state[0];
59
+ tmp32 += kHpZeroCoefs[2] * filter_state[1];
60
+ filter_state[1] = filter_state[0];
61
+ filter_state[0] = *in_ptr++;
62
+
63
+ // All-pole section (filter coefficients in Q14).
64
+ tmp32 -= kHpPoleCoefs[1] * filter_state[2];
65
+ tmp32 -= kHpPoleCoefs[2] * filter_state[3];
66
+ filter_state[3] = filter_state[2];
67
+ filter_state[2] = (int16_t) (tmp32 >> 14);
68
+ *out_ptr++ = filter_state[2];
69
+ }
70
+ }
71
+
72
// First-order all pass filter applied to every second sample of |data_in|.
// It is run before the signal is split into a low pass and a high pass band.
// |data_in| and |data_out| must NOT refer to the same memory.
//
// - data_in            [i]   : Input audio signal in Q0. Samples 0, 2, 4, ...
//                              are consumed.
// - data_length        [i]   : Number of output samples to produce.
// - filter_coefficient [i]   : Filter coefficient in Q15.
// - filter_state       [i/o] : Filter state in Q(-1).
// - data_out           [o]   : Output audio signal in Q(-1).
//
// The 16-bit output can only overflow if more than 4 consecutive input samples
// have maximum magnitude and the same sign as the first taps of the impulse
// response. First 6 taps of the impulse response:
//   0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
static void AllPassFilter(const int16_t* data_in, size_t data_length,
                          int16_t filter_coefficient, int16_t* filter_state,
                          int16_t* data_out) {
  // Widen the stored Q(-1) state to the Q15 working precision.
  int32_t state_q15 = ((int32_t) (*filter_state)) * (1 << 16);

  for (size_t n = 0; n < data_length; ++n) {
    const int32_t sum = state_q15 + filter_coefficient * data_in[2 * n];
    const int16_t filtered = (int16_t) (sum >> 16);  // Q(-1)
    data_out[n] = filtered;

    // New state in Q14, then doubled to get back to Q15.
    state_q15 = (data_in[2 * n] * (1 << 14)) - filter_coefficient * filtered;
    state_q15 *= 2;
  }

  *filter_state = (int16_t) (state_q15 >> 16);  // Q(-1)
}
106
+
107
+ // Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
108
+ // an upper (high pass) part and a lower (low pass) part respectively.
109
+ //
110
+ // - data_in [i] : Input audio data to be split into two frequency bands.
111
+ // - data_length [i] : Length of |data_in|.
112
+ // - upper_state [i/o] : State of the upper filter, given in Q(-1).
113
+ // - lower_state [i/o] : State of the lower filter, given in Q(-1).
114
+ // - hp_data_out [o] : Output audio data of the upper half of the spectrum.
115
+ // The length is |data_length| / 2.
116
+ // - lp_data_out [o] : Output audio data of the lower half of the spectrum.
117
+ // The length is |data_length| / 2.
118
+ static void SplitFilter(const int16_t* data_in, size_t data_length,
119
+ int16_t* upper_state, int16_t* lower_state,
120
+ int16_t* hp_data_out, int16_t* lp_data_out) {
121
+ size_t i;
122
+ size_t half_length = data_length >> 1; // Downsampling by 2.
123
+ int16_t tmp_out;
124
+
125
+ // All-pass filtering upper branch.
126
+ AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
127
+ hp_data_out);
128
+
129
+ // All-pass filtering lower branch.
130
+ AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
131
+ lp_data_out);
132
+
133
+ // Make LP and HP signals.
134
+ for (i = 0; i < half_length; i++) {
135
+ tmp_out = *hp_data_out;
136
+ *hp_data_out++ -= *lp_data_out;
137
+ *lp_data_out++ += tmp_out;
138
+ }
139
+ }
140
+
141
+ // Calculates the energy of |data_in| in dB, and also updates an overall
142
+ // |total_energy| if necessary.
143
+ //
144
+ // - data_in [i] : Input audio data for energy calculation.
145
+ // - data_length [i] : Length of input data.
146
+ // - offset [i] : Offset value added to |log_energy|.
147
+ // - total_energy [i/o] : An external energy updated with the energy of
148
+ // |data_in|.
149
+ // NOTE: |total_energy| is only updated if
150
+ // |total_energy| <= |kMinEnergy|.
151
+ // - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
152
+ static void LogOfEnergy(const int16_t* data_in, size_t data_length,
153
+ int16_t offset, int16_t* total_energy,
154
+ int16_t* log_energy) {
155
+ // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
156
+ int tot_rshifts = 0;
157
+ // The |energy| will be normalized to 15 bits. We use unsigned integer because
158
+ // we eventually will mask out the fractional part.
159
+ uint32_t energy = 0;
160
+
161
+ RTC_DCHECK(data_in);
162
+ RTC_DCHECK_GT(data_length, 0);
163
+
164
+ energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
165
+ &tot_rshifts);
166
+
167
+ if (energy != 0) {
168
+ // By construction, normalizing to 15 bits is equivalent with 17 leading
169
+ // zeros of an unsigned 32 bit value.
170
+ int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
171
+ // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
172
+ // (14 << 10), which is what we initialize |log2_energy| with. For a more
173
+ // detailed derivations, see below.
174
+ int16_t log2_energy = kLogEnergyIntPart;
175
+
176
+ tot_rshifts += normalizing_rshifts;
177
+ // Normalize |energy| to 15 bits.
178
+ // |tot_rshifts| is now the total number of right shifts performed on
179
+ // |energy| after normalization. This means that |energy| is in
180
+ // Q(-tot_rshifts).
181
+ if (normalizing_rshifts < 0) {
182
+ energy <<= -normalizing_rshifts;
183
+ } else {
184
+ energy >>= normalizing_rshifts;
185
+ }
186
+
187
+ // Calculate the energy of |data_in| in dB, in Q4.
188
+ //
189
+ // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
190
+ // 160 * log10(|energy| * 2^|tot_rshifts|) =
191
+ // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
192
+ // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
193
+ // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
194
+ // |kLogConst| * (|log2_energy| + |tot_rshifts|)
195
+ //
196
+ // We know by construction that |energy| is normalized to 15 bits. Hence,
197
+ // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
198
+ // Further, we'd like |log2_energy| in Q10
199
+ // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
200
+ // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
201
+ // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
202
+ // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
203
+ // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
204
+ //
205
+ // Note that frac_Q15 = (|energy| & 0x00003FFF)
206
+
207
+ // Calculate and add the fractional part to |log2_energy|.
208
+ log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
209
+
210
+ // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
211
+ // Note that we in our derivation above have accounted for an output in Q4.
212
+ *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
213
+ ((tot_rshifts * kLogConst) >> 9));
214
+
215
+ if (*log_energy < 0) {
216
+ *log_energy = 0;
217
+ }
218
+ } else {
219
+ *log_energy = offset;
220
+ return;
221
+ }
222
+
223
+ *log_energy += offset;
224
+
225
+ // Update the approximate |total_energy| with the energy of |data_in|, if
226
+ // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
227
+ // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
228
+ if (*total_energy <= kMinEnergy) {
229
+ if (tot_rshifts >= 0) {
230
+ // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
231
+ // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
232
+ *total_energy += kMinEnergy + 1;
233
+ } else {
234
+ // By construction |energy| is represented by 15 bits, hence any number of
235
+ // right shifted |energy| will fit in an int16_t. In addition, adding the
236
+ // value to |total_energy| is wrap around safe as long as
237
+ // |kMinEnergy| < 8192.
238
+ *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
239
+ }
240
+ }
241
+ }
242
+
243
+ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
244
+ size_t data_length, int16_t* features) {
245
+ int16_t total_energy = 0;
246
+ // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
247
+ // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
248
+ // have at most 120 samples after the first split and at most 60 samples after
249
+ // the second split.
250
+ int16_t hp_120[120], lp_120[120];
251
+ int16_t hp_60[60], lp_60[60];
252
+ const size_t half_data_length = data_length >> 1;
253
+ size_t length = half_data_length; // |data_length| / 2, corresponds to
254
+ // bandwidth = 2000 Hz after downsampling.
255
+
256
+ // Initialize variables for the first SplitFilter().
257
+ int frequency_band = 0;
258
+ const int16_t* in_ptr = data_in; // [0 - 4000] Hz.
259
+ int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz.
260
+ int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz.
261
+
262
+ RTC_DCHECK_LE(data_length, 240);
263
+ RTC_DCHECK_LT(4, kNumChannels - 1); // Checking maximum |frequency_band|.
264
+
265
+ // Split at 2000 Hz and downsample.
266
+ SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
267
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
268
+
269
+ // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
270
+ frequency_band = 1;
271
+ in_ptr = hp_120; // [2000 - 4000] Hz.
272
+ hp_out_ptr = hp_60; // [3000 - 4000] Hz.
273
+ lp_out_ptr = lp_60; // [2000 - 3000] Hz.
274
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
275
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
276
+
277
+ // Energy in 3000 Hz - 4000 Hz.
278
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
279
+
280
+ LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
281
+
282
+ // Energy in 2000 Hz - 3000 Hz.
283
+ LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
284
+
285
+ // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
286
+ frequency_band = 2;
287
+ in_ptr = lp_120; // [0 - 2000] Hz.
288
+ hp_out_ptr = hp_60; // [1000 - 2000] Hz.
289
+ lp_out_ptr = lp_60; // [0 - 1000] Hz.
290
+ length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz.
291
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
292
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
293
+
294
+ // Energy in 1000 Hz - 2000 Hz.
295
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
296
+ LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
297
+
298
+ // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
299
+ frequency_band = 3;
300
+ in_ptr = lp_60; // [0 - 1000] Hz.
301
+ hp_out_ptr = hp_120; // [500 - 1000] Hz.
302
+ lp_out_ptr = lp_120; // [0 - 500] Hz.
303
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
304
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
305
+
306
+ // Energy in 500 Hz - 1000 Hz.
307
+ length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
308
+ LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
309
+
310
+ // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
311
+ frequency_band = 4;
312
+ in_ptr = lp_120; // [0 - 500] Hz.
313
+ hp_out_ptr = hp_60; // [250 - 500] Hz.
314
+ lp_out_ptr = lp_60; // [0 - 250] Hz.
315
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
316
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
317
+
318
+ // Energy in 250 Hz - 500 Hz.
319
+ length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
320
+ LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
321
+
322
+ // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
323
+ HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
324
+
325
+ // Energy in 80 Hz - 250 Hz.
326
+ LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
327
+
328
+ return total_energy;
329
+ }
@@ -0,0 +1,45 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ /*
12
+ * This file includes feature calculating functionality used in vad_core.c.
13
+ */
14
+
15
+ #ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
16
+ #define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
17
+
18
+ #include "common_audio/vad/vad_core.h"
19
+
20
+ // Takes |data_length| samples of |data_in| and calculates the logarithm of the
21
+ // energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
22
+ // 80 Hz - 250 Hz
23
+ // 250 Hz - 500 Hz
24
+ // 500 Hz - 1000 Hz
25
+ // 1000 Hz - 2000 Hz
26
+ // 2000 Hz - 3000 Hz
27
+ // 3000 Hz - 4000 Hz
28
+ //
29
+ // The values are given in Q4 and written to |features|. Further, an approximate
30
+ // overall energy is returned. The return value is used in
31
+ // WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
32
+ // the threshold |kMinEnergy|.
33
+ //
34
+ // - self [i/o] : State information of the VAD.
35
+ // - data_in [i] : Input audio data, for feature extraction.
36
+ // - data_length [i] : Audio data size, in number of samples.
37
+ // - features [o] : 10 * log10(energy in each frequency band), Q4.
38
+ // - returns : Total energy of the signal (NOTE! This value is not
39
+ // exact. It is only used in a comparison.)
40
+ int16_t WebRtcVad_CalculateFeatures(VadInstT* self,
41
+ const int16_t* data_in,
42
+ size_t data_length,
43
+ int16_t* features);
44
+
45
+ #endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_