RubyGems - webrtcvad - Versions diffs - 0.1.0 → 0.2.3 - Mend

webrtcvad 0.1.0 → 0.2.3

Files changed (35) hide show

data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.c ADDED

@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "common_audio/vad/vad_gmm.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+// Exponent threshold in Q10 (22005 / 1024 ~= 21.5). Exponents at or above this
+// value are treated as giving a zero probability.
+static const int32_t kCompVar = 22005;
+static const int16_t kLog2Exp = 5909;  // log2(exp(1)) in Q12.
+// For a normal distribution, the probability of |input| is calculated and
+// returned (in Q20). The formula for normal distributed probability is
+//
+// 1 / s * exp(-(x - m)^2 / (2 * s^2))
+//
+// where the parameters are given in the following Q domains:
+// m = |mean| (Q7)
+// s = |std| (Q7)
+// x = |input| (Q4)
+// In addition to the probability we output |delta| (in Q11), which is used
+// when updating the noise/speech model. |delta| is always written. The return
+// value is 0 when the exponent is not below |kCompVar|.
+int32_t WebRtcVad_GaussianProbability(int16_t input,
+                                      int16_t mean,
+                                      int16_t std,
+                                      int16_t* delta) {
+  int16_t tmp16, inv_std, inv_std2, exp_value = 0;
+  int32_t tmp32;
+  // Calculate |inv_std| = 1 / s, in Q10.
+  // 131072 = 1 in Q17, and (|std| >> 1) is for rounding instead of truncation.
+  // Q-domain: Q17 / Q7 = Q10.
+  tmp32 = (int32_t) 131072 + (int32_t) (std >> 1);
+  inv_std = (int16_t) WebRtcSpl_DivW32W16(tmp32, std);
+  // Calculate |inv_std2| = 1 / s^2, in Q14.
+  tmp16 = (inv_std >> 2);  // Q10 -> Q8.
+  // Q-domain: (Q8 * Q8) >> 2 = Q14.
+  inv_std2 = (int16_t)((tmp16 * tmp16) >> 2);
+  // TODO(bjornv): Investigate if changing to
+  // inv_std2 = (int16_t)((inv_std * inv_std) >> 6);
+  // gives better accuracy.
+  // Q4 -> Q7. A multiplication is used instead of |input << 3| because
+  // left-shifting a negative value is undefined behavior in C; the result is
+  // the same for every |input|.
+  tmp16 = (int16_t)(input * 8);
+  tmp16 = tmp16 - mean;  // Q7 - Q7 = Q7
+  // To be used later, when updating noise/speech model.
+  // |delta| = (x - m) / s^2, in Q11.
+  // Q-domain: (Q14 * Q7) >> 10 = Q11.
+  *delta = (int16_t)((inv_std2 * tmp16) >> 10);
+  // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10.
+  // Q-domain: (Q11 * Q7) >> 8 = Q10; the ninth shift step performs the
+  // division by two.
+  tmp32 = (*delta * tmp16) >> 9;
+  // If the exponent is small enough to give a non-zero probability we calculate
+  // |exp_value| ~= exp(-(x - m)^2 / (2 * s^2))
+  //             ~= exp2(-log2(exp(1)) * |tmp32|).
+  if (tmp32 < kCompVar) {
+    // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10.
+    // Q-domain: (Q12 * Q10) >> 12 = Q10.
+    tmp16 = (int16_t)((kLog2Exp * tmp32) >> 12);
+    tmp16 = -tmp16;
+    // The ten fractional bits f of the negated exponent give the mantissa
+    // (1 + f) in Q10, i.e. exp2(f) is approximated linearly.
+    exp_value = (0x0400 | (tmp16 & 0x03FF));
+    // Complement, drop the fractional bits and add one to get the number of
+    // whole powers of two to divide the mantissa by.
+    tmp16 ^= 0xFFFF;
+    tmp16 >>= 10;
+    tmp16 += 1;
+    // Get |exp_value| = exp(-|tmp32|) in Q10.
+    exp_value >>= tmp16;
+  }
+  // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
+  // Q-domain: Q10 * Q10 = Q20.
+  return inv_std * exp_value;
+}

data/ext/webrtcvad/webrtc/common_audio/vad/vad_gmm.h ADDED

@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+// Gaussian probability calculations internally used in vad_core.c.
+#ifndef COMMON_AUDIO_VAD_VAD_GMM_H_
+#define COMMON_AUDIO_VAD_VAD_GMM_H_
+#include <stdint.h>
+// Calculates the probability for |input|, given that |input| comes from a
+// normal distribution with mean and standard deviation (|mean|, |std|).
+// All quantities are fixed-point values in the Q domains listed below.
+// Inputs:
+//      - input         : input sample, Q4.
+//      - mean          : mean of the statistical model, Q7.
+//      - std           : standard deviation of the statistical model, Q7.
+//
+// Output:
+//      - delta         : value used by the caller when updating the model,
+//                        Q11. It is always written.
+//                        |delta| = (|input| - |mean|) / |std|^2.
+//
+// Return (Q20):
+//   (probability for |input|) =
+//    1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2));
+int32_t WebRtcVad_GaussianProbability(int16_t input,
+                                      int16_t mean,
+                                      int16_t std,
+                                      int16_t* delta);
+#endif  // COMMON_AUDIO_VAD_VAD_GMM_H_

data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.c ADDED

@@ -0,0 +1,176 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "common_audio/vad/vad_sp.h"
+#include "rtc_base/checks.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+#include "common_audio/vad/vad_core.h"
+// All-pass filter coefficients in Q13 used by WebRtcVad_Downsampling().
+// Index 0 is the upper branch (0.64), index 1 is the lower branch (0.17).
+static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 };  // Q13.
+static const int16_t kSmoothingDown = 6553;  // 0.2 in Q15 (median < mean).
+static const int16_t kSmoothingUp = 32439;  // 0.99 in Q15 (median >= mean).
+// TODO(bjornv): Move this function to vad_filterbank.c.
+// Downsamples |signal_in| by a factor of two. Even-indexed input samples run
+// through the upper all-pass branch and odd-indexed samples through the lower
+// branch; every output sample is the sum of the two branch outputs.
+// |filter_state[0]| and |filter_state[1]| carry the upper and lower branch
+// state and are written back once all sample pairs have been processed.
+void WebRtcVad_Downsampling(const int16_t* signal_in,
+                            int16_t* signal_out,
+                            int32_t* filter_state,
+                            size_t in_length) {
+  // Two input samples are consumed per output sample.
+  const size_t out_length = in_length / 2;
+  int32_t state_upper = filter_state[0];
+  int32_t state_lower = filter_state[1];
+  size_t k;
+  // Coefficients are in Q13, the filter states in Q0.
+  for (k = 0; k < out_length; ++k) {
+    const size_t even = 2 * k;
+    const size_t odd = even + 1;
+    int16_t branch;
+    // Upper branch. The output is stored before the state update, and the
+    // input is re-read for the state update, exactly as a pointer walk would.
+    branch = (int16_t) ((state_upper >> 1) +
+        ((kAllPassCoefsQ13[0] * signal_in[even]) >> 14));
+    signal_out[k] = branch;
+    state_upper = (int32_t) signal_in[even] -
+        ((kAllPassCoefsQ13[0] * branch) >> 12);
+    // Lower branch, accumulated on top of the upper branch output.
+    branch = (int16_t) ((state_lower >> 1) +
+        ((kAllPassCoefsQ13[1] * signal_in[odd]) >> 14));
+    signal_out[k] += branch;
+    state_lower = (int32_t) signal_in[odd] -
+        ((kAllPassCoefsQ13[1] * branch) >> 12);
+  }
+  // Persist the branch states for the next call.
+  filter_state[0] = state_upper;
+  filter_state[1] = state_lower;
+}
+// Maintains, per |channel|, the 16 smallest feature values seen during the
+// last 100 calls together with their ages, inserts |feature_value| if it is
+// small enough, and returns the updated smoothed value
+// |self->mean_value[channel]|. The value that is smoothed is entry 2 of the
+// stored minima (the median of the five smallest, given that the list is kept
+// in ascending order), entry 0 while |frame_counter| is 1 or 2, and the default
+// 1600 while |frame_counter| is not positive.
+int16_t WebRtcVad_FindMinimum(VadInstT* self,
+                              int16_t feature_value,
+                              int channel) {
+  // Each channel owns 16 consecutive entries in both history vectors.
+  const int base = (channel << 4);
+  int16_t* const ages = &self->index_vector[base];
+  int16_t* const minima = &self->low_value_vector[base];
+  int insert_at = -1;
+  int searching = 1;
+  int lo = 0;
+  int hi = 0;
+  int k = 0;
+  int m = 0;
+  int16_t median = 1600;
+  int16_t weight = 0;
+  int32_t acc = 0;
+  RTC_DCHECK_LT(channel, kNumChannels);
+  // Age every stored value by one call. A value that has reached age 100 is
+  // dropped: the entries above it move down one slot and the last slot is
+  // refilled with a sentinel (value 10000, age 101).
+  for (k = 0; k < 16; k++) {
+    if (ages[k] == 100) {
+      for (m = k; m < 15; m++) {
+        minima[m] = minima[m + 1];
+        ages[m] = ages[m + 1];
+      }
+      ages[15] = 101;
+      minima[15] = 10000;
+    } else {
+      ages[k]++;
+    }
+  }
+  // Pick the half of the list that can hold |feature_value|; values that are
+  // not smaller than the last entry are not inserted at all.
+  if (feature_value < minima[7]) {
+    lo = 0;
+    hi = 7;
+  } else if (feature_value < minima[15]) {
+    lo = 8;
+    hi = 15;
+  } else {
+    searching = 0;
+  }
+  // Bisect the chosen half. The pivots visited are the same ones a fully
+  // unrolled comparison tree over the 16 entries would use.
+  if (searching) {
+    while (lo < hi) {
+      const int mid = (lo + hi) >> 1;
+      if (feature_value < minima[mid]) {
+        hi = mid;
+      } else {
+        lo = mid + 1;
+      }
+    }
+    insert_at = lo;
+  }
+  // Open a gap at |insert_at| by moving larger values up, then store the new
+  // value with age 1.
+  if (insert_at >= 0) {
+    for (k = 15; k > insert_at; k--) {
+      minima[k] = minima[k - 1];
+      ages[k] = ages[k - 1];
+    }
+    minima[insert_at] = feature_value;
+    ages[insert_at] = 1;
+  }
+  // Select the value to smooth towards and the smoothing factor (Q15). With no
+  // frames counted yet both keep their defaults (1600 and 0).
+  if (self->frame_counter > 0) {
+    median = (self->frame_counter > 2) ? minima[2] : minima[0];
+    weight = (median < self->mean_value[channel]) ? kSmoothingDown
+                                                  : kSmoothingUp;
+  }
+  // Weighted sum of the previous mean and the median, rounded by adding half
+  // of 2^15 before shifting back down.
+  acc = (weight + 1) * self->mean_value[channel];
+  acc += (WEBRTC_SPL_WORD16_MAX - weight) * median;
+  acc += 16384;
+  self->mean_value[channel] = (int16_t) (acc >> 15);
+  return self->mean_value[channel];
+}

data/ext/webrtcvad/webrtc/common_audio/vad/vad_sp.h ADDED

@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+// This file includes specific signal processing tools used in vad_core.c.
+#ifndef COMMON_AUDIO_VAD_VAD_SP_H_
+#define COMMON_AUDIO_VAD_VAD_SP_H_
+#include "common_audio/vad/vad_core.h"
+// Downsamples the signal by a factor of 2, e.g. 32->16 or 16->8.
+//
+// Inputs:
+//      - signal_in     : Input signal.
+//      - in_length     : Length of input signal in samples. Samples are used
+//                        in pairs, so a trailing odd sample is not consumed.
+// Input & Output:
+//      - filter_state  : Two-element state, one entry per all-pass filter
+//                        ([0] upper branch, [1] lower branch). It is written
+//                        back after all samples have been processed.
+//
+// Output:
+//      - signal_out    : Downsampled signal (of length |in_length| / 2).
+void WebRtcVad_Downsampling(const int16_t* signal_in,
+                            int16_t* signal_out,
+                            int32_t* filter_state,
+                            size_t in_length);
+// Updates and returns the smoothed feature minimum. As minimum we use the
+// median of the five smallest feature values in a 100-frame long window.
+// As long as |handle->frame_counter| is zero, that is, we haven't received any
+// "valid" data, FindMinimum() smooths towards the default value of 1600.
+//
+// Inputs:
+//      - feature_value : New feature value to update with.
+//      - channel       : Channel number; must be less than kNumChannels.
+// Input & Output:
+//      - handle        : State information of the VAD. The stored minima and
+//                        the smoothed mean of |channel| are updated.
+//
+// Returns:
+//                      : Smoothed minimum value for a moving window.
+int16_t WebRtcVad_FindMinimum(VadInstT* handle,
+                              int16_t feature_value,
+                              int channel);
+#endif  // COMMON_AUDIO_VAD_VAD_SP_H_

data/ext/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c ADDED

@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "common_audio/vad/include/webrtc_vad.h"
+#include <stdlib.h>
+#include <string.h>
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+#include "common_audio/vad/vad_core.h"
+static const int kInitCheck = 42;  // |init_flag| value that set_mode/Process require.
+static const int kValidRates[] = { 8000, 16000, 32000, 48000 };  // Accepted sample rates, in Hz.
+static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates);  // Entries in |kValidRates|.
+static const int kMaxFrameLengthMs = 30;  // Longest accepted frame, in ms.
+// Allocates a new VAD instance and marks it as not yet initialized
+// (|init_flag| = 0), so WebRtcVad_set_mode() and WebRtcVad_Process() return -1
+// until |init_flag| equals kInitCheck (presumably set by WebRtcVad_InitCore();
+// verify there).
+//
+// Returns the new instance, or NULL if the allocation fails. The caller owns
+// the instance and releases it with WebRtcVad_Free().
+VadInst* WebRtcVad_Create(void) {
+  VadInstT* self = (VadInstT*)malloc(sizeof(VadInstT));
+  // malloc() may fail; do not write through a NULL pointer.
+  if (self == NULL) {
+    return NULL;
+  }
+  self->init_flag = 0;
+  return (VadInst*)self;
+}
+void WebRtcVad_Free(VadInst* handle) {  // Releases an instance obtained from WebRtcVad_Create().
+  free(handle);  // free() ignores NULL, so no guard is needed.
+}
+// TODO(bjornv): Move WebRtcVad_InitCore() code here.
+// Initializes the instance by delegating to the core VAD component and returns
+// the result of WebRtcVad_InitCore() unchanged.
+int WebRtcVad_Init(VadInst* handle) {
+  VadInstT* const self = (VadInstT*) handle;
+  return WebRtcVad_InitCore(self);
+}
+// TODO(bjornv): Move WebRtcVad_set_mode_core() code here.
+// Sets the operating mode of an initialized instance. Returns -1 if |handle|
+// is NULL or its |init_flag| is not kInitCheck; otherwise returns the result of
+// WebRtcVad_set_mode_core().
+int WebRtcVad_set_mode(VadInst* handle, int mode) {
+  VadInstT* const self = (VadInstT*) handle;
+  // The short-circuit keeps the NULL test ahead of the dereference.
+  if (self == NULL || self->init_flag != kInitCheck) {
+    return -1;
+  }
+  return WebRtcVad_set_mode_core(self, mode);
+}
+// Runs the VAD on one frame of audio sampled at |fs| Hz.
+//
+// Returns -1 if |handle| is NULL or not initialized, if |audio_frame| is NULL,
+// or if WebRtcVad_ValidRateAndFrameLength() rejects (|fs|, |frame_length|).
+// Otherwise the result of the rate-specific WebRtcVad_CalcVad*khz() routine is
+// returned, with every positive value collapsed to 1.
+int WebRtcVad_Process(VadInst* handle, int fs, const int16_t* audio_frame,
+                      size_t frame_length) {
+  VadInstT* const self = (VadInstT*) handle;
+  int decision = -1;
+  // Instance checks first; the short-circuit guards the dereference.
+  if (self == NULL || self->init_flag != kInitCheck) {
+    return -1;
+  }
+  // Then the frame itself and its rate/length combination.
+  if (audio_frame == NULL ||
+      WebRtcVad_ValidRateAndFrameLength(fs, frame_length) != 0) {
+    return -1;
+  }
+  switch (fs) {
+    case 48000:
+      decision = WebRtcVad_CalcVad48khz(self, audio_frame, frame_length);
+      break;
+    case 32000:
+      decision = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length);
+      break;
+    case 16000:
+      decision = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length);
+      break;
+    case 8000:
+      decision = WebRtcVad_CalcVad8khz(self, audio_frame, frame_length);
+      break;
+    default:
+      break;
+  }
+  // Zero and negative results are passed through untouched.
+  return (decision > 0) ? 1 : decision;
+}
+// Checks whether |frame_length| samples at |rate| Hz is an accepted frame.
+// Only the rates in |kValidRates| and frames of 10, 20, ... up to
+// |kMaxFrameLengthMs| milliseconds are accepted.
+//
+// Returns 0 for an accepted combination and -1 otherwise.
+int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length) {
+  size_t r;
+  int ms;
+  for (r = 0; r < kRatesSize; ++r) {
+    if (kValidRates[r] != rate) {
+      continue;
+    }
+    // The rate is known; accept the frame if it lasts a whole multiple of
+    // 10 ms within the allowed maximum.
+    for (ms = 10; ms <= kMaxFrameLengthMs; ms += 10) {
+      if (frame_length == (size_t)(kValidRates[r] / 1000 * ms)) {
+        return 0;
+      }
+    }
+    // A rate appears only once in the table, so no later entry can match.
+    return -1;
+  }
+  return -1;
+}