webrtcvad 0.1.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/ext/webrtcvad/extconf.rb +29 -0
  3. data/ext/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c +141 -0
  4. data/ext/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h +40 -0
  5. data/ext/webrtcvad/webrtc/common_audio/signal_processing/energy.c +39 -0
  6. data/ext/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c +46 -0
  7. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/signal_processing_library.h +1605 -0
  8. data/ext/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h +153 -0
  9. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c +186 -0
  10. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.c +689 -0
  11. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h +60 -0
  12. data/ext/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c +239 -0
  13. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c +77 -0
  14. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h +29 -0
  15. data/ext/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c +207 -0
  16. data/ext/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h +87 -0
  17. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.c +685 -0
  18. data/ext/webrtcvad/webrtc/common_audio/vad/vad_core.h +114 -0
  19. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.c +329 -0
  20. data/ext/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h +45 -0
  21. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c +82 -0
  22. data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h +39 -0
  23. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c +176 -0
  24. data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h +54 -0
  25. data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c +114 -0
  26. data/ext/webrtcvad/webrtc/rtc_base/checks.cc +207 -0
  27. data/ext/webrtcvad/webrtc/rtc_base/checks.h +400 -0
  28. data/ext/webrtcvad/webrtc/rtc_base/compile_assert_c.h +25 -0
  29. data/ext/webrtcvad/webrtc/rtc_base/numerics/safe_compare.h +176 -0
  30. data/ext/webrtcvad/webrtc/rtc_base/sanitizer.h +144 -0
  31. data/ext/webrtcvad/webrtc/rtc_base/system/inline.h +31 -0
  32. data/ext/webrtcvad/webrtc/rtc_base/system/rtc_export.h +43 -0
  33. data/ext/webrtcvad/webrtc/rtc_base/type_traits.h +140 -0
  34. data/ext/webrtcvad/webrtcvad.c +112 -0
  35. metadata +37 -3
@@ -0,0 +1,114 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ /*
12
+ * This header file includes the descriptions of the core VAD calls.
13
+ */
14
+
15
+ #ifndef COMMON_AUDIO_VAD_VAD_CORE_H_
16
+ #define COMMON_AUDIO_VAD_VAD_CORE_H_
17
+
18
+ #include "common_audio/signal_processing/include/signal_processing_library.h"
19
+
20
+ enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
21
+ enum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM.
22
+ enum { kTableSize = kNumChannels * kNumGaussians };
23
+ enum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal.
24
+
25
+ typedef struct VadInstT_ { // State information of one core VAD instance.
26
+ int vad;
27
+ int32_t downsampling_filter_states[4];
28
+ WebRtcSpl_State48khzTo8khz state_48_to_8;
29
+ int16_t noise_means[kTableSize]; // kTableSize = kNumChannels * kNumGaussians entries.
30
+ int16_t speech_means[kTableSize];
31
+ int16_t noise_stds[kTableSize];
32
+ int16_t speech_stds[kTableSize];
33
+ // TODO(bjornv): Change to |frame_count|.
34
+ int32_t frame_counter;
35
+ int16_t over_hang; // Over Hang
36
+ int16_t num_of_speech;
37
+ // TODO(bjornv): Change to |age_vector|.
38
+ int16_t index_vector[16 * kNumChannels];
39
+ int16_t low_value_vector[16 * kNumChannels];
40
+ // TODO(bjornv): Change to |median|.
41
+ int16_t mean_value[kNumChannels];
42
+ int16_t upper_state[5]; // Upper all-pass states, one per SplitFilter() stage in vad_filterbank.c.
43
+ int16_t lower_state[5]; // Lower all-pass states, one per SplitFilter() stage in vad_filterbank.c.
44
+ int16_t hp_filter_state[4]; // HighPassFilter() state: two past inputs, then two past outputs.
45
+ int16_t over_hang_max_1[3];
46
+ int16_t over_hang_max_2[3];
47
+ int16_t individual[3];
48
+ int16_t total[3];
49
+
50
+ int init_flag;
51
+ } VadInstT;
52
+
53
+ // Initializes the core VAD component. The default aggressiveness mode is
54
+ // controlled by |kDefaultMode| in vad_core.c.
55
+ //
56
+ // - self [i/o] : Instance that should be initialized
57
+ //
58
+ // returns : 0 (OK), -1 (if |self| is a null pointer or the default mode
59
+ // can't be set)
60
+ int WebRtcVad_InitCore(VadInstT* self);
61
+
62
+ /****************************************************************************
63
+ * WebRtcVad_set_mode_core(...)
64
+ *
65
+ * This function changes the VAD settings
66
+ *
67
+ * Input:
68
+ * - self : VAD instance
69
+ * - mode : Aggressiveness degree
70
+ * 0 (High quality) - 3 (Highly aggressive)
71
+ *
72
+ * Output:
73
+ * - self : Changed instance
74
+ *
75
+ * Return value : 0 - Ok
76
+ * -1 - Error
77
+ */
78
+
79
+ int WebRtcVad_set_mode_core(VadInstT* self, int mode);
80
+
81
+ /****************************************************************************
82
+ * WebRtcVad_CalcVad48khz(...)
83
+ * WebRtcVad_CalcVad32khz(...)
84
+ * WebRtcVad_CalcVad16khz(...)
85
+ * WebRtcVad_CalcVad8khz(...)
86
+ *
87
+ * Calculate probability for active speech and make VAD decision.
88
+ *
89
+ * Input:
90
+ * - inst : VAD instance to run the decision on
91
+ * - speech_frame : Input speech frame
92
+ * - frame_length : Number of input samples
93
+ *
94
+ * Output:
95
+ * - inst : Updated filter states etc.
96
+ *
97
+ * Return value : VAD decision
98
+ * 0 - No active speech
99
+ * 1-6 - Active speech
100
+ */
101
+ int WebRtcVad_CalcVad48khz(VadInstT* inst,
102
+ const int16_t* speech_frame,
103
+ size_t frame_length);
104
+ int WebRtcVad_CalcVad32khz(VadInstT* inst,
105
+ const int16_t* speech_frame,
106
+ size_t frame_length);
107
+ int WebRtcVad_CalcVad16khz(VadInstT* inst,
108
+ const int16_t* speech_frame,
109
+ size_t frame_length);
110
+ int WebRtcVad_CalcVad8khz(VadInstT* inst,
111
+ const int16_t* speech_frame,
112
+ size_t frame_length);
113
+
114
+ #endif // COMMON_AUDIO_VAD_VAD_CORE_H_
@@ -0,0 +1,329 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ #include "common_audio/vad/vad_filterbank.h"
12
+
13
+ #include "rtc_base/checks.h"
14
+ #include "common_audio/signal_processing/include/signal_processing_library.h"
15
+
16
+ // Constants used in LogOfEnergy().
17
+ static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
18
+ static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10
19
+
20
+ // Coefficients used by HighPassFilter, Q14.
21
+ static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
22
+ static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };
23
+
24
+ // Allpass filter coefficients, upper and lower, in Q15.
25
+ // Upper: 0.64, Lower: 0.17
26
+ static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };
27
+
28
+ // Adjustment for division with two in SplitFilter.
29
+ static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
30
+
31
+ // High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
32
+ // sampled at 500 Hz.
33
+ //
34
+ // - data_in [i] : Input audio data sampled at 500 Hz.
35
+ // - data_length [i] : Length of input and output data.
36
+ // - filter_state [i/o] : State of the filter, 4 values: x[n-1], x[n-2], y[n-1], y[n-2].
37
+ // - data_out [o] : Output audio data in the frequency interval
38
+ // 80 - 250 Hz.
39
+ static void HighPassFilter(const int16_t* data_in, size_t data_length,
40
+ int16_t* filter_state, int16_t* data_out) {
41
+ size_t i;
42
+ const int16_t* in_ptr = data_in;
43
+ int16_t* out_ptr = data_out;
44
+ int32_t tmp32 = 0; // Accumulator; carries the 14 fractional bits of the Q14 coefficients.
45
+
46
+
47
+ // The sum of the absolute values of the impulse response:
48
+ // The zero/pole-filter has a max amplification of a single sample of: 1.4546
49
+ // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
50
+ // The all-zero section has a max amplification of a single sample of: 1.6189
51
+ // Impulse response: 0.4047 -0.8094 0.4047 0 0 0
52
+ // The all-pole section has a max amplification of a single sample of: 1.9931
53
+ // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
54
+
55
+ for (i = 0; i < data_length; i++) {
56
+ // All-zero section (filter coefficients in Q14).
57
+ tmp32 = kHpZeroCoefs[0] * *in_ptr; // b0 * x[n]
58
+ tmp32 += kHpZeroCoefs[1] * filter_state[0]; // b1 * x[n-1]
59
+ tmp32 += kHpZeroCoefs[2] * filter_state[1]; // b2 * x[n-2]
60
+ filter_state[1] = filter_state[0]; // x[n-1] becomes x[n-2].
61
+ filter_state[0] = *in_ptr++; // x[n] becomes x[n-1].
62
+
63
+ // All-pole section (filter coefficients in Q14).
64
+ tmp32 -= kHpPoleCoefs[1] * filter_state[2]; // a1 * y[n-1]
65
+ tmp32 -= kHpPoleCoefs[2] * filter_state[3]; // a2 * y[n-2]
66
+ filter_state[3] = filter_state[2]; // y[n-1] becomes y[n-2].
67
+ filter_state[2] = (int16_t) (tmp32 >> 14); // Drop the 14 fractional bits; kHpPoleCoefs[0] (1.0 in Q14) is implicit.
68
+ *out_ptr++ = filter_state[2];
69
+ }
70
+ }
71
+
72
+ // All pass filtering of |data_in|, used before splitting the signal into two
73
+ // frequency bands (low pass vs high pass).
74
+ // Note that |data_in| and |data_out| can NOT correspond to the same address.
75
+ //
76
+ // - data_in [i] : Input audio signal given in Q0.
77
+ // - data_length [i] : Number of output samples; |data_in| is read at every second index (0, 2, 4, ...).
78
+ // - filter_coefficient [i] : Given in Q15.
79
+ // - filter_state [i/o] : State of the filter given in Q(-1).
80
+ // - data_out [o] : Output audio signal given in Q(-1).
81
+ static void AllPassFilter(const int16_t* data_in, size_t data_length,
82
+ int16_t filter_coefficient, int16_t* filter_state,
83
+ int16_t* data_out) {
84
+ // The filter can only cause overflow (in the w16 output variable)
85
+ // if more than 4 consecutive input numbers are of maximum value and
86
+ // have the same sign as the first taps of the impulse response.
87
+ // First 6 taps of the impulse response:
88
+ // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
89
+
90
+ size_t i;
91
+ int16_t tmp16 = 0;
92
+ int32_t tmp32 = 0;
93
+ int32_t state32 = ((int32_t) (*filter_state) * (1 << 16)); // Q15
94
+
95
+ for (i = 0; i < data_length; i++) {
96
+ tmp32 = state32 + filter_coefficient * *data_in;
97
+ tmp16 = (int16_t) (tmp32 >> 16); // Q(-1)
98
+ *data_out++ = tmp16;
99
+ state32 = (*data_in * (1 << 14)) - filter_coefficient * tmp16; // Q14
100
+ state32 *= 2; // Q15.
101
+ data_in += 2; // Skip one sample: each call consumes only every second input sample.
102
+ }
103
+
104
+ *filter_state = (int16_t) (state32 >> 16); // Q(-1)
105
+ }
106
+
107
+ // Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
108
+ // an upper (high pass) part and a lower (low pass) part respectively.
109
+ //
110
+ // - data_in [i] : Input audio data to be split into two frequency bands.
111
+ // - data_length [i] : Length of |data_in|.
112
+ // - upper_state [i/o] : State of the upper filter, given in Q(-1).
113
+ // - lower_state [i/o] : State of the lower filter, given in Q(-1).
114
+ // - hp_data_out [o] : Output audio data of the upper half of the spectrum.
115
+ // The length is |data_length| / 2.
116
+ // - lp_data_out [o] : Output audio data of the lower half of the spectrum.
117
+ // The length is |data_length| / 2.
118
+ static void SplitFilter(const int16_t* data_in, size_t data_length,
119
+ int16_t* upper_state, int16_t* lower_state,
120
+ int16_t* hp_data_out, int16_t* lp_data_out) {
121
+ size_t i;
122
+ size_t half_length = data_length >> 1; // Downsampling by 2. An odd trailing sample is never read.
123
+ int16_t tmp_out;
124
+
125
+ // All-pass filtering upper branch (even-indexed input samples).
126
+ AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
127
+ hp_data_out);
128
+
129
+ // All-pass filtering lower branch (odd-indexed input samples).
130
+ AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
131
+ lp_data_out);
132
+
133
+ // Make LP and HP signals: HP = upper - lower, LP = lower + upper.
134
+ for (i = 0; i < half_length; i++) {
135
+ tmp_out = *hp_data_out; // Keep the upper-branch output before it is overwritten.
136
+ *hp_data_out++ -= *lp_data_out;
137
+ *lp_data_out++ += tmp_out;
138
+ }
139
+ }
140
+
141
+ // Calculates the energy of |data_in| in dB, and also updates an overall
142
+ // |total_energy| if necessary.
143
+ //
144
+ // - data_in [i] : Input audio data for energy calculation.
145
+ // - data_length [i] : Length of input data.
146
+ // - offset [i] : Offset value added to |log_energy|.
147
+ // - total_energy [i/o] : An external energy updated with the energy of
148
+ // |data_in|.
149
+ // NOTE: |total_energy| is only updated if
150
+ // |total_energy| <= |kMinEnergy| and the energy is non-zero.
151
+ // - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4, floored at zero, plus |offset|.
152
+ static void LogOfEnergy(const int16_t* data_in, size_t data_length,
153
+ int16_t offset, int16_t* total_energy,
154
+ int16_t* log_energy) {
155
+ // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
156
+ int tot_rshifts = 0;
157
+ // The |energy| will be normalized to 15 bits. We use unsigned integer because
158
+ // we eventually will mask out the fractional part.
159
+ uint32_t energy = 0;
160
+
161
+ RTC_DCHECK(data_in);
162
+ RTC_DCHECK_GT(data_length, 0);
163
+
164
+ energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length, // NOTE(review): cast drops const; presumably the callee does not modify the input - confirm.
165
+ &tot_rshifts);
166
+
167
+ if (energy != 0) {
168
+ // By construction, normalizing to 15 bits is equivalent with 17 leading
169
+ // zeros of an unsigned 32 bit value.
170
+ int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
171
+ // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
172
+ // (14 << 10), which is what we initialize |log2_energy| with. For a more
173
+ // detailed derivation, see below.
174
+ int16_t log2_energy = kLogEnergyIntPart;
175
+
176
+ tot_rshifts += normalizing_rshifts;
177
+ // Normalize |energy| to 15 bits.
178
+ // |tot_rshifts| is now the total number of right shifts performed on
179
+ // |energy| after normalization. This means that |energy| is in
180
+ // Q(-tot_rshifts).
181
+ if (normalizing_rshifts < 0) {
182
+ energy <<= -normalizing_rshifts; // A negative right shift count means scaling up.
183
+ } else {
184
+ energy >>= normalizing_rshifts;
185
+ }
186
+
187
+ // Calculate the energy of |data_in| in dB, in Q4.
188
+ //
189
+ // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
190
+ // 160 * log10(|energy| * 2^|tot_rshifts|) =
191
+ // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
192
+ // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
193
+ // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
194
+ // |kLogConst| * (|log2_energy| + |tot_rshifts|)
195
+ //
196
+ // We know by construction that |energy| is normalized to 15 bits. Hence,
197
+ // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
198
+ // Further, we'd like |log2_energy| in Q10
199
+ // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
200
+ // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
201
+ // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
202
+ // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
203
+ // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
204
+ //
205
+ // Note that frac_Q15 = (|energy| & 0x00003FFF)
206
+
207
+ // Calculate and add the fractional part to |log2_energy|.
208
+ log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
209
+
210
+ // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
211
+ // Note that we in our derivation above have accounted for an output in Q4.
212
+ *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
213
+ ((tot_rshifts * kLogConst) >> 9));
214
+
215
+ if (*log_energy < 0) {
216
+ *log_energy = 0;
217
+ }
218
+ } else {
219
+ *log_energy = offset;
220
+ return; // Zero energy: |total_energy| is left unchanged.
221
+ }
222
+
223
+ *log_energy += offset;
224
+
225
+ // Update the approximate |total_energy| with the energy of |data_in|, if
226
+ // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
227
+ // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
228
+ if (*total_energy <= kMinEnergy) {
229
+ if (tot_rshifts >= 0) {
230
+ // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
231
+ // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
232
+ *total_energy += kMinEnergy + 1;
233
+ } else {
234
+ // By construction |energy| is represented by 15 bits, hence any number of
235
+ // right shifted |energy| will fit in an int16_t. In addition, adding the
236
+ // value to |total_energy| is wrap around safe as long as
237
+ // |kMinEnergy| < 8192.
238
+ *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
239
+ }
240
+ }
241
+ }
242
+
243
+ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, // Contract is documented in vad_filterbank.h.
244
+ size_t data_length, int16_t* features) {
245
+ int16_t total_energy = 0;
246
+ // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
247
+ // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
248
+ // have at most 120 samples after the first split and at most 60 samples after
249
+ // the second split.
250
+ int16_t hp_120[120], lp_120[120];
251
+ int16_t hp_60[60], lp_60[60];
252
+ const size_t half_data_length = data_length >> 1;
253
+ size_t length = half_data_length; // |data_length| / 2, corresponds to
254
+ // bandwidth = 2000 Hz after downsampling.
255
+
256
+ // Initialize variables for the first SplitFilter().
257
+ int frequency_band = 0;
258
+ const int16_t* in_ptr = data_in; // [0 - 4000] Hz.
259
+ int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz.
260
+ int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz.
261
+
262
+ RTC_DCHECK_LE(data_length, 240); // Longer frames would overflow the 120/60-sample scratch buffers.
263
+ RTC_DCHECK_LT(4, kNumChannels - 1); // Checking maximum |frequency_band|.
264
+
265
+ // Split at 2000 Hz and downsample.
266
+ SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
267
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
268
+
269
+ // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
270
+ frequency_band = 1;
271
+ in_ptr = hp_120; // [2000 - 4000] Hz.
272
+ hp_out_ptr = hp_60; // [3000 - 4000] Hz.
273
+ lp_out_ptr = lp_60; // [2000 - 3000] Hz.
274
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
275
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
276
+
277
+ // Energy in 3000 Hz - 4000 Hz.
278
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
279
+
280
+ LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
281
+
282
+ // Energy in 2000 Hz - 3000 Hz.
283
+ LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
284
+
285
+ // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
286
+ frequency_band = 2;
287
+ in_ptr = lp_120; // [0 - 2000] Hz.
288
+ hp_out_ptr = hp_60; // [1000 - 2000] Hz.
289
+ lp_out_ptr = lp_60; // [0 - 1000] Hz.
290
+ length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz.
291
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
292
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
293
+
294
+ // Energy in 1000 Hz - 2000 Hz.
295
+ length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
296
+ LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
297
+
298
+ // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
299
+ frequency_band = 3;
300
+ in_ptr = lp_60; // [0 - 1000] Hz.
301
+ hp_out_ptr = hp_120; // [500 - 1000] Hz. The 120-sample buffer is reused as scratch.
302
+ lp_out_ptr = lp_120; // [0 - 500] Hz. The 120-sample buffer is reused as scratch.
303
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
304
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
305
+
306
+ // Energy in 500 Hz - 1000 Hz.
307
+ length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
308
+ LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
309
+
310
+ // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
311
+ frequency_band = 4;
312
+ in_ptr = lp_120; // [0 - 500] Hz.
313
+ hp_out_ptr = hp_60; // [250 - 500] Hz.
314
+ lp_out_ptr = lp_60; // [0 - 250] Hz.
315
+ SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
316
+ &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
317
+
318
+ // Energy in 250 Hz - 500 Hz.
319
+ length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
320
+ LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
321
+
322
+ // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
323
+ HighPassFilter(lp_60, length, self->hp_filter_state, hp_120); // |hp_120| is reused as the output buffer.
324
+
325
+ // Energy in 80 Hz - 250 Hz.
326
+ LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
327
+
328
+ return total_energy;
329
+ }
@@ -0,0 +1,45 @@
1
+ /*
2
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license
5
+ * that can be found in the LICENSE file in the root of the source
6
+ * tree. An additional intellectual property rights grant can be found
7
+ * in the file PATENTS. All contributing project authors may
8
+ * be found in the AUTHORS file in the root of the source tree.
9
+ */
10
+
11
+ /*
12
+ * This file includes feature calculating functionality used in vad_core.c.
13
+ */
14
+
15
+ #ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
16
+ #define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_
17
+
18
+ #include "common_audio/vad/vad_core.h"
19
+
20
+ // Takes |data_length| samples of |data_in| and calculates the logarithm of the
21
+ // energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
22
+ // 80 Hz - 250 Hz
23
+ // 250 Hz - 500 Hz
24
+ // 500 Hz - 1000 Hz
25
+ // 1000 Hz - 2000 Hz
26
+ // 2000 Hz - 3000 Hz
27
+ // 3000 Hz - 4000 Hz
28
+ //
29
+ // The values are given in Q4 and written to |features|. Further, an approximate
30
+ // overall energy is returned. The return value is used in
31
+ // WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
32
+ // the threshold |kMinEnergy|.
33
+ //
34
+ // - self [i/o] : State information of the VAD.
35
+ // - data_in [i] : Input audio data, for feature extraction.
36
+ // - data_length [i] : Audio data size, in number of samples (at most 240).
37
+ // - features [o] : |kNumChannels| values, 10 * log10(energy in each frequency band), Q4.
38
+ // - returns : Total energy of the signal (NOTE! This value is not
39
+ // exact. It is only used in a comparison.)
40
+ int16_t WebRtcVad_CalculateFeatures(VadInstT* self,
41
+ const int16_t* data_in,
42
+ size_t data_length,
43
+ int16_t* features);
44
+
45
+ #endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_