com.adrenak.univoice 4.9.0 → 4.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/Runtime/ClientSession.cs +5 -2
- package/Runtime/Common/SimpleVad.cs +379 -0
- package/Runtime/Common/SimpleVad.cs.meta +11 -0
- package/Runtime/Impl/Filters/SimpleVadFilter.cs +17 -0
- package/Runtime/Impl/Filters/SimpleVadFilter.cs.meta +11 -0
- package/Runtime/Impl/Inputs/EmptyAudioInput.cs +2 -0
- package/Samples~/Basic Setup Scripts/UniVoiceFishNetSetupSample.cs +8 -0
- package/Samples~/Basic Setup Scripts/UniVoiceFishNetSetupSample.unity +1 -1
- package/Samples~/Basic Setup Scripts/UniVoiceMirrorSetupSample.cs +8 -0
- package/Samples~/Basic Setup Scripts/UniVoiceMirrorSetupSample.unity +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -24,6 +24,7 @@ Some features of UniVoice:
|
|
|
24
24
|
- Built-in support for:
|
|
25
25
|
- Opus (Concentus) encoding & decoding.
|
|
26
26
|
- RNNoise based noise removal.
|
|
27
|
+
- Energy based VAD (Voice Activity Detection)
|
|
27
28
|
- Gaussian blurring for minor denoising.
|
|
28
29
|
|
|
29
30
|
- 👥 Easy integration with your existing networking solution
|
|
@@ -59,7 +60,7 @@ Ensure you have the NPM registry in the `manifest.json` file of your Unity proje
|
|
|
59
60
|
"com.adrenak.concentus-unity"
|
|
60
61
|
]
|
|
61
62
|
}
|
|
62
|
-
|
|
63
|
+
]
|
|
63
64
|
```
|
|
64
65
|
Then add `com.adrenak.univoice:x.y.z` to the `dependencies` in your `manifest.json` file (where x.y.z is the version you wish to install). The list of versions is available on [the UniVoice NPM page](https://www.npmjs.com/package/com.adrenak.univoice?activeTab=versions).
|
|
65
66
|
|
|
@@ -120,4 +121,5 @@ The author can be reached at the following links:
|
|
|
120
121
|
[LinkedIn](https://www.linkedin.com/in/vatsalAmbastha)
|
|
121
122
|
[GitHub](https://www.github.com/adrenak)
|
|
122
123
|
[Twitter](https://www.twitter.com/vatsalAmbastha)
|
|
123
|
-
|
|
124
|
+
|
|
125
|
+
Discord: `adrenak#1934`
|
package/Runtime/ClientSession.cs
CHANGED
|
@@ -204,11 +204,14 @@ namespace Adrenak.UniVoice {
|
|
|
204
204
|
return;
|
|
205
205
|
|
|
206
206
|
if (InputFilters != null) {
|
|
207
|
-
foreach (var filter in InputFilters)
|
|
207
|
+
foreach (var filter in InputFilters) {
|
|
208
208
|
frame = filter.Run(frame);
|
|
209
|
+
if (frame.samples == null)
|
|
210
|
+
break;
|
|
211
|
+
}
|
|
209
212
|
}
|
|
210
213
|
|
|
211
|
-
if (frame.samples.Length > 0)
|
|
214
|
+
if (frame.samples != null && frame.samples.Length > 0)
|
|
212
215
|
Client.SendAudioFrame(frame);
|
|
213
216
|
};
|
|
214
217
|
}
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
using System;
|
|
2
|
+
|
|
3
|
+
namespace Adrenak.UniVoice {
    /// <summary>
    /// A minimal, adaptive voice activity detector operating on time-domain PCM.
    /// Supports float [-1,1] and 16-bit samples, with per-call adaptation to
    /// input frequency and channel count. Multi-channel input is downmixed to mono.
    /// </summary>
    /// <remarks>
    /// The detector emits <see cref="OnVadChanged"/> when the speaking state toggles.
    /// Timings (attack, release, gaps) are maintained in milliseconds and remain
    /// stable across sample-rate changes.
    /// </remarks>
    public class SimpleVad {
        /// <summary>
        /// Configuration for the <see cref="SimpleVad"/> voice activity detector.
        /// All time-based parameters are expressed in milliseconds.
        /// </summary>
        [Serializable]
        public class Config {
            /// <summary>
            /// Target analysis frame duration in milliseconds. The frame sample count
            /// is computed from the current input frequency whenever the frequency changes.
            /// Non-positive values fall back to 20 ms.
            /// </summary>
            public int TargetFrameMs = 20;

            /// <summary>
            /// Minimum continuous speech duration required to enter the speaking state.
            /// </summary>
            public int AttackMs = 20;

            /// <summary>
            /// Minimum continuous silence duration required to exit the speaking state.
            /// </summary>
            public int ReleaseMs = 1000;

            /// <summary>
            /// SNR threshold in decibels used to enter the speaking state.
            /// Higher values make entry stricter.
            /// </summary>
            public float SnrEnterDb = 8f;

            /// <summary>
            /// SNR threshold in decibels used to remain in the speaking state.
            /// </summary>
            public float SnrExitDb = 4f;

            /// <summary>
            /// Maximum tolerated duration of consecutive quiet frames while already speaking.
            /// </summary>
            public int MaxGapMs = 300;

            /// <summary>
            /// Grace period after speech onset during which release is disallowed.
            /// </summary>
            public int NoDropWindowMs = 400;

            /// <summary>
            /// Noise-floor update rate (EMA alpha) during non-speech.
            /// </summary>
            public float NonSpeechNoiseUpdateRate = 0.01f;

            /// <summary>
            /// Noise-floor update rate (EMA alpha) during speech.
            /// </summary>
            public float SpeechNoiseUpdateRate = 0.002f;

            /// <summary>
            /// The minimum allowed value for the estimated noise level (RMS).
            /// Prevents the noise estimate from collapsing toward zero, which
            /// would make SNR calculations unstable or excessively large.
            /// </summary>
            public float MinNoiseRms = 1e-5f;

            /// <summary>
            /// Energy floor used to clamp extremely low RMS values.
            /// </summary>
            public float EnergyFloor = 1e-5f;
        }

        /// <summary>
        /// Raised when the VAD speaking state changes.
        /// The event argument is true when entering the speaking state,
        /// and false when exiting.
        /// </summary>
        public event Action<bool> OnVadChanged;

        /// <summary>
        /// Current speaking state.
        /// </summary>
        public bool IsSpeaking { get; private set; }

        // Starting point for the noise estimate before any audio has been seen.
        private const float DefaultNoiseRms = 5e-3f;

        // Frame duration used when Config.TargetFrameMs is not a positive number.
        private const int DefaultFrameMs = 20;

        // Lower bound for the analysis frame length, in mono samples.
        private const int MinFrameSamples = 80;

        // Approximate amount of audio used for noise learning after a geometry change.
        private const double WarmupMs = 200.0;

        // Small constant added to numerator and denominator to avoid log(0) or division by zero.
        private const float Eps = 1e-12f;

        private readonly Config _config;

        // Temporary buffer used to collect samples until one full frame is ready.
        private float[] _frameBuf;

        // Current fill position within the frame buffer.
        private int _frameFill;

        // Sample rate currently used for frame geometry.
        private int _curSampleRate = -1;

        // Cached frame size in samples for the current sample rate.
        private int _frameSamples = 0;

        // Duration (ms) of a single analysis frame at the current sample rate.
        private float _frameDurationMs = 0f;

        // Current adaptive noise level estimate (RMS). Updated every frame via EMA.
        private float _noiseRms;

        // Time (ms) of continuous speech detected so far.
        private float _speechMs;

        // Time (ms) of continuous silence detected so far.
        private float _silenceMs;

        // Time (ms) since the most recent transition into the speaking state.
        private float _sinceOnsetMs;

        // Accumulated quiet period (ms) while still considered speaking.
        private float _gapMs;

        // Warm-up frames during which we only learn noise and disallow onset.
        private int _warmupFrames = 0;

        // Initial noise estimate; never lower than the configured minimum.
        // Evaluated on demand because Config fields can be edited at runtime.
        private float InitialNoiseRms {
            get { return Math.Max(_config.MinNoiseRms, DefaultNoiseRms); }
        }

        /// <summary>
        /// Initializes a new instance of <see cref="SimpleVad"/>.
        /// </summary>
        /// <param name="config">Optional configuration. If null, defaults are used.</param>
        public SimpleVad(Config config = null) {
            _config = config ?? new Config();
            _noiseRms = InitialNoiseRms;
            IsSpeaking = false;
        }

        /// <summary>
        /// Ensures internal frame geometry matches the provided frequency.
        /// Recomputes the frame sample count and resets partial-frame state when changed.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. Callers must pass a positive value.</param>
        private void EnsureGeometry(int frequency) {
            if (frequency == _curSampleRate && _frameBuf != null) {
                return;
            }

            _curSampleRate = frequency;

            // A non-positive target would make the warm-up computation divide by zero
            // (casting the resulting infinity to int is unspecified), so fall back to the default.
            int targetFrameMs = _config.TargetFrameMs > 0 ? _config.TargetFrameMs : DefaultFrameMs;

            // Choose frame sample count from target frame duration.
            int frameSamples = Math.Max(MinFrameSamples, (_curSampleRate * targetFrameMs) / 1000);

            // Recompute warm-up frames for the new rate: ~200 ms of noise learning.
            _warmupFrames = Math.Max(1, (int)Math.Ceiling(WarmupMs / targetFrameMs));

            if (frameSamples != _frameSamples || _frameBuf == null) {
                _frameSamples = frameSamples;
                _frameBuf = new float[_frameSamples];
                _frameFill = 0;
            }

            _frameDurationMs = 1000f * _frameSamples / (float)_curSampleRate;

            // When geometry changes, reset streaming timers so old partials don't leak across rates.
            _speechMs = 0f;
            _silenceMs = 0f;
            _gapMs = 0f;
            if (IsSpeaking) {
                _sinceOnsetMs = 0f;
            }

            // Never let the noise estimate start below the initial floor after a change.
            _noiseRms = Math.Max(_noiseRms, InitialNoiseRms);
        }

        /// <summary>
        /// Resets internal state and timers. The noise estimate returns to its
        /// initial value. <see cref="OnVadChanged"/> is not raised by this call.
        /// </summary>
        /// <param name="isSpeaking">Initial speaking state after reset.</param>
        public void Reset(bool isSpeaking = false) {
            if (_frameBuf != null) {
                Array.Clear(_frameBuf, 0, _frameBuf.Length);
            }
            _frameFill = 0;
            _speechMs = 0f;
            _silenceMs = 0f;
            _sinceOnsetMs = 0f;
            _gapMs = 0f;
            // Same floor as the constructor, so a configured MinNoiseRms is honored here too.
            _noiseRms = InitialNoiseRms;
            IsSpeaking = isSpeaking;
        }

        /// <summary>
        /// Processes interleaved float PCM in the range [-1, 1] with adaptive
        /// handling of frequency and channels. Multi-channel input is downmixed
        /// to mono via averaging.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. Calls with a non-positive value are ignored.</param>
        /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1 (mono).</param>
        /// <param name="samples">Buffer containing interleaved sample data. Null is ignored.</param>
        /// <param name="count">
        /// Number of elements from <paramref name="samples"/> to process. Values larger than
        /// the buffer length are clamped; a trailing partial multi-channel sample is ignored.
        /// </param>
        public void Process(int frequency, int channels, float[] samples, int count) {
            if (samples == null) {
                return;
            }

            int usable = BeginProcess(frequency, ref channels, samples.Length, count);

            for (int start = 0; start < usable; start += channels) {
                // Downmix one interleaved multi-channel sample to mono.
                float sum = 0f;
                for (int c = 0; c < channels; c++) {
                    sum += samples[start + c];
                }
                PushMonoSample(sum / channels);
            }
        }

        /// <summary>
        /// Processes interleaved 16-bit PCM with adaptive handling of frequency and channels.
        /// Multi-channel input is downmixed to mono via averaging.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz. Calls with a non-positive value are ignored.</param>
        /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1.</param>
        /// <param name="samples">Buffer containing interleaved sample data. Null is ignored.</param>
        /// <param name="count">
        /// Number of elements from <paramref name="samples"/> to process. Values larger than
        /// the buffer length are clamped; a trailing partial multi-channel sample is ignored.
        /// </param>
        public void Process(int frequency, int channels, short[] samples, int count) {
            if (samples == null) {
                return;
            }

            int usable = BeginProcess(frequency, ref channels, samples.Length, count);

            for (int start = 0; start < usable; start += channels) {
                float sum = 0f;
                for (int c = 0; c < channels; c++) {
                    // Scale 16-bit PCM to [-1, 1).
                    sum += samples[start + c] / 32768f;
                }
                PushMonoSample(sum / channels);
            }
        }

        /// <summary>
        /// Convenience overload for processing fully-filled float buffers.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz.</param>
        /// <param name="channels">Number of interleaved channels.</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        public void Process(int frequency, int channels, float[] samples)
            => Process(frequency, channels, samples, samples?.Length ?? 0);

        /// <summary>
        /// Convenience overload for processing fully-filled 16-bit buffers.
        /// </summary>
        /// <param name="frequency">Input sample rate in Hz.</param>
        /// <param name="channels">Number of interleaved channels.</param>
        /// <param name="samples">Buffer containing interleaved sample data.</param>
        public void Process(int frequency, int channels, short[] samples)
            => Process(frequency, channels, samples, samples?.Length ?? 0);

        // Validates the arguments shared by both Process overloads, normalizes the channel
        // count, updates frame geometry, and returns how many interleaved elements can be
        // consumed (a multiple of 'channels'; 0 when there is nothing to do).
        private int BeginProcess(int frequency, ref int channels, int available, int count) {
            // A non-positive rate would yield an infinite or negative frame duration.
            if (frequency <= 0 || count <= 0 || available <= 0) {
                return 0;
            }
            if (channels <= 0) {
                channels = 1;
            }

            // Reconfigure frame geometry if sample rate changed or not initialized.
            EnsureGeometry(frequency);

            // Never read past the end of the caller's buffer; drop any trailing partial sample.
            int clamped = Math.Min(count, available);
            return (clamped / channels) * channels;
        }

        // Appends one mono sample to the frame buffer and analyzes the frame once it is full.
        private void PushMonoSample(float mono) {
            _frameBuf[_frameFill++] = mono;

            if (_frameFill == _frameBuf.Length) {
                // Rewind first: if an OnVadChanged handler throws, the next call must
                // not index past the end of the buffer.
                _frameFill = 0;
                ProcessOneFrame(_frameBuf);
            }
        }

        /// <summary>
        /// Processes a single analysis frame and updates the speaking state.
        /// </summary>
        /// <param name="frame">Mono frame of length equal to the current frame size.</param>
        private void ProcessOneFrame(float[] frame) {
            // --- Energy / RMS ---
            double sumSq = 0;
            for (int i = 0; i < frame.Length; i++) {
                float s = frame[i];
                sumSq += (double)s * s;
            }
            float rms = (float)Math.Sqrt(sumSq / frame.Length);
            rms = Math.Max(rms, _config.EnergyFloor);

            // --- SNR(dB) vs noise floor ---
            float noise = Math.Max(_noiseRms, _config.MinNoiseRms);
            float snrDb = 20f * (float)Math.Log10((rms + Eps) / (noise + Eps));

            // Pick threshold depending on current state (hysteresis).
            float threshold = IsSpeaking ? _config.SnrExitDb : _config.SnrEnterDb;
            bool rawSpeech = (snrDb >= threshold) && (rms > _config.EnergyFloor);

            // Noise EMA: slow during speech, faster during non-speech.
            float alpha = rawSpeech ? _config.SpeechNoiseUpdateRate : _config.NonSpeechNoiseUpdateRate;
            _noiseRms = (1f - alpha) * _noiseRms + alpha * rms;
            _noiseRms = Math.Max(_noiseRms, _config.MinNoiseRms);

            // During warm-up the speaking state never changes; the frame counts as
            // silence for the timers while the EMA above keeps adapting the noise floor.
            if (_warmupFrames > 0) {
                _warmupFrames--;
                _silenceMs += _frameDurationMs;
                _speechMs = 0f;
                return;
            }

            // --- Gap filling: allow brief quiet while speaking ---
            if (IsSpeaking && !rawSpeech) {
                _gapMs += _frameDurationMs;
            }
            else {
                _gapMs = 0f;
            }

            // Effective speech used for timers/state.
            bool effectiveSpeech = rawSpeech || (IsSpeaking && _gapMs <= _config.MaxGapMs);

            // --- Time-based hangover & no-drop window ---
            if (effectiveSpeech) {
                _speechMs += _frameDurationMs;
                _silenceMs = 0f;
            }
            else {
                _silenceMs += _frameDurationMs;
                _speechMs = 0f;
            }

            bool wasSpeaking = IsSpeaking;
            bool nowSpeaking = wasSpeaking;

            if (!wasSpeaking) {
                // Enter speaking after AttackMs of continuous effective speech.
                if (_speechMs >= _config.AttackMs) {
                    nowSpeaking = true;
                }
            }
            else {
                _sinceOnsetMs += _frameDurationMs;

                // Exit only once ReleaseMs of effective silence has accumulated AND
                // the initial no-drop window after onset has passed.
                if (_silenceMs >= _config.ReleaseMs && _sinceOnsetMs >= _config.NoDropWindowMs) {
                    nowSpeaking = false;
                }
            }

            if (nowSpeaking == wasSpeaking) {
                return;
            }

            // Bring the timers into their post-transition state before notifying,
            // so handlers observe a consistent detector.
            _gapMs = 0f;
            if (nowSpeaking) {
                _sinceOnsetMs = 0f;
            }
            else {
                _silenceMs = 0f;
                _speechMs = 0f;
            }

            IsSpeaking = nowSpeaking;
            OnVadChanged?.Invoke(nowSpeaking);
        }
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
namespace Adrenak.UniVoice.Filters {
    /// <summary>
    /// An <see cref="IAudioFilter"/> that feeds every incoming frame to a
    /// <see cref="SimpleVad"/> and passes the frame through only while the
    /// detector reports speech. Otherwise a default (empty) frame is returned.
    /// </summary>
    public class SimpleVadFilter : IAudioFilter {
        private readonly SimpleVad _vad;

        /// <summary>
        /// Creates a filter that gates audio using the given detector.
        /// </summary>
        /// <param name="vad">The detector used to decide whether a frame is kept.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="vad"/> is null.</exception>
        public SimpleVadFilter(SimpleVad vad) {
            // Fail at construction time instead of with a NullReferenceException on the first frame.
            if (vad == null) {
                throw new System.ArgumentNullException(nameof(vad));
            }
            _vad = vad;
        }

        /// <summary>
        /// Runs voice activity detection on <paramref name="input"/>.
        /// </summary>
        /// <param name="input">The frame to analyze.</param>
        /// <returns>
        /// <paramref name="input"/> unchanged while the detector is in the speaking state
        /// (or when the frame carries no sample buffer); otherwise a default frame.
        /// </returns>
        public AudioFrame Run(AudioFrame input) {
            // Nothing to analyze: hand the frame back untouched rather than passing
            // a null buffer to the byte-to-float converter.
            if (input.samples == null) {
                return input;
            }

            _vad.Process(input.frequency, input.channelCount, Utils.Bytes.BytesToFloats(input.samples));

            if (_vad.IsSpeaking) {
                return input;
            }
            return default;
        }
    }
}
|
|
@@ -51,6 +51,8 @@ namespace Adrenak.UniVoice.Samples {
|
|
|
51
51
|
|
|
52
52
|
[SerializeField] bool useConcentusEncodeAndDecode = true;
|
|
53
53
|
|
|
54
|
+
[SerializeField] bool useVad = true;
|
|
55
|
+
|
|
54
56
|
void Start() {
|
|
55
57
|
if (HasSetUp) {
|
|
56
58
|
Debug.unityLogger.Log(LogType.Log, TAG, "UniVoice is already set up. Ignoring...");
|
|
@@ -179,6 +181,12 @@ namespace Adrenak.UniVoice.Samples {
|
|
|
179
181
|
}
|
|
180
182
|
#endif
|
|
181
183
|
|
|
184
|
+
if (useVad) {
|
|
185
|
+
// We add the VAD filter after RNNoise.
|
|
186
|
+
// This way a lot of the background noise has already been removed, so the VAD is truly trying to detect voice
|
|
187
|
+
ClientSession.InputFilters.Add(new SimpleVadFilter(new SimpleVad()));
|
|
188
|
+
}
|
|
189
|
+
|
|
182
190
|
if (useConcentusEncodeAndDecode) {
|
|
183
191
|
// ConcentusEncodeFilter encodes captured audio, which reduces the audio frame size
|
|
184
192
|
ClientSession.InputFilters.Add(new ConcentusEncodeFilter());
|
|
@@ -38,7 +38,6 @@ RenderSettings:
|
|
|
38
38
|
m_ReflectionIntensity: 1
|
|
39
39
|
m_CustomReflection: {fileID: 0}
|
|
40
40
|
m_Sun: {fileID: 0}
|
|
41
|
-
m_IndirectSpecularColor: {r: 0.37311926, g: 0.38073996, b: 0.35872698, a: 1}
|
|
42
41
|
m_UseRadianceAmbientProbe: 0
|
|
43
42
|
--- !u!157 &3
|
|
44
43
|
LightmapSettings:
|
|
@@ -154,6 +153,7 @@ MonoBehaviour:
|
|
|
154
153
|
m_EditorClassIdentifier:
|
|
155
154
|
useRNNoise4UnityIfAvailable: 1
|
|
156
155
|
useConcentusEncodeAndDecode: 1
|
|
156
|
+
useVad: 1
|
|
157
157
|
--- !u!4 &810914623
|
|
158
158
|
Transform:
|
|
159
159
|
m_ObjectHideFlags: 0
|
|
@@ -51,6 +51,8 @@ namespace Adrenak.UniVoice.Samples {
|
|
|
51
51
|
|
|
52
52
|
[SerializeField] bool useConcentusEncodeAndDecode = true;
|
|
53
53
|
|
|
54
|
+
[SerializeField] bool useVad = true;
|
|
55
|
+
|
|
54
56
|
void Start() {
|
|
55
57
|
if (HasSetUp) {
|
|
56
58
|
Debug.unityLogger.Log(LogType.Log, TAG, "UniVoice is already set up. Ignoring...");
|
|
@@ -179,6 +181,12 @@ namespace Adrenak.UniVoice.Samples {
|
|
|
179
181
|
}
|
|
180
182
|
#endif
|
|
181
183
|
|
|
184
|
+
if (useVad) {
|
|
185
|
+
// We add the VAD filter after RNNoise.
|
|
186
|
+
// This way a lot of the background noise has already been removed, so the VAD is truly trying to detect voice
|
|
187
|
+
ClientSession.InputFilters.Add(new SimpleVadFilter(new SimpleVad()));
|
|
188
|
+
}
|
|
189
|
+
|
|
182
190
|
if (useConcentusEncodeAndDecode) {
|
|
183
191
|
// ConcentusEncodeFilter encodes captured audio, which reduces the audio frame size
|
|
184
192
|
ClientSession.InputFilters.Add(new ConcentusEncodeFilter());
|
|
@@ -38,7 +38,6 @@ RenderSettings:
|
|
|
38
38
|
m_ReflectionIntensity: 1
|
|
39
39
|
m_CustomReflection: {fileID: 0}
|
|
40
40
|
m_Sun: {fileID: 0}
|
|
41
|
-
m_IndirectSpecularColor: {r: 0.44657874, g: 0.49641275, b: 0.5748172, a: 1}
|
|
42
41
|
m_UseRadianceAmbientProbe: 0
|
|
43
42
|
--- !u!157 &3
|
|
44
43
|
LightmapSettings:
|
|
@@ -419,6 +418,7 @@ MonoBehaviour:
|
|
|
419
418
|
m_EditorClassIdentifier:
|
|
420
419
|
useRNNoise4UnityIfAvailable: 1
|
|
421
420
|
useConcentusEncodeAndDecode: 1
|
|
421
|
+
useVad: 1
|
|
422
422
|
--- !u!114 &1902381431
|
|
423
423
|
MonoBehaviour:
|
|
424
424
|
m_ObjectHideFlags: 0
|