com.adrenak.univoice 4.9.0 → 4.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -24,6 +24,7 @@ Some features of UniVoice:
24
24
  - Built-in support for:
25
25
  - Opus (Concentus) encoding & decoding.
26
26
  - RNNoise based noise removal.
27
+ - Energy based VAD (Voice Activity Detection)
27
28
  - Gaussian blurring for minor denoising.
28
29
 
29
30
  - 👥 Easy integration with your existing networking solution
@@ -59,7 +60,7 @@ Ensure you have the NPM registry in the `manifest.json` file of your Unity proje
59
60
  "com.adrenak.concentus-unity"
60
61
  ]
61
62
  }
62
- }
63
+ ]
63
64
  ```
64
65
  Then add `com.adrenak.univoice:x.y.z` to the `dependencies` in your `manifest.json` file (where x.y.z is the version you wish to install). The list of versions is available on [the UniVoice NPM page](https://www.npmjs.com/package/com.adrenak.univoice?activeTab=versions).
65
66
 
@@ -120,4 +121,5 @@ The author can be reached at the following links:
120
121
  [LinkedIn](https://www.linkedin.com/in/vatsalAmbastha)
121
122
  [GitHub](https://www.github.com/adrenak)
122
123
  [Twitter](https://www.twitter.com/vatsalAmbastha)
123
- Discord: `adrenak#1934`
124
+
125
+ Discord: `adrenak#1934`
@@ -204,11 +204,14 @@ namespace Adrenak.UniVoice {
204
204
  return;
205
205
 
206
206
  if (InputFilters != null) {
207
- foreach (var filter in InputFilters)
207
+ foreach (var filter in InputFilters) {
208
208
  frame = filter.Run(frame);
209
+ if (frame.samples == null)
210
+ break;
211
+ }
209
212
  }
210
213
 
211
- if (frame.samples.Length > 0)
214
+ if (frame.samples != null && frame.samples.Length > 0)
212
215
  Client.SendAudioFrame(frame);
213
216
  };
214
217
  }
@@ -0,0 +1,379 @@
1
+ using System;
2
+
3
+ namespace Adrenak.UniVoice {
4
+ /// <summary>
5
+ /// A minimal, adaptive voice activity detector operating on time-domain PCM.
6
+ /// Supports float [-1,1] and 16-bit samples, with per-call adaptation to
7
+ /// input frequency and channel count. Multi-channel input is downmixed to mono.
8
+ /// </summary>
9
+ /// <remarks>
10
+ /// The detector emits <see cref="OnVadChanged"/> when the speaking state toggles.
11
+ /// Timings (attack, release, gaps) are maintained in milliseconds and remain
12
+ /// stable across sample-rate changes.
13
+ /// </remarks>
14
+ public class SimpleVad {
15
+ /// <summary>
16
+ /// Configuration for the <see cref="SimpleVad"/> voice activity detector.
17
+ /// All time-based parameters are expressed in milliseconds.
18
+ /// </summary>
19
+ [Serializable]
20
+ public class Config {
21
+ /// <summary>
22
+ /// Target analysis frame duration in milliseconds. The frame sample count
23
+ /// is computed from the current input frequency each call.
24
+ /// </summary>
25
+ public int TargetFrameMs = 20;
26
+
27
+ /// <summary>
28
+ /// Minimum continuous speech duration required to enter the speaking state.
29
+ /// </summary>
30
+ public int AttackMs = 20;
31
+
32
+ /// <summary>
33
+ /// Minimum continuous silence duration required to exit the speaking state.
34
+ /// </summary>
35
+ public int ReleaseMs = 1000;
36
+
37
+ /// <summary>
38
+ /// SNR threshold in decibels used to enter the speaking state.
39
+ /// Higher values make entry stricter.
40
+ /// </summary>
41
+ public float SnrEnterDb = 8f;
42
+
43
+ /// <summary>
44
+ /// SNR threshold in decibels used to remain in the speaking state.
45
+ /// </summary>
46
+ public float SnrExitDb = 4f;
47
+
48
+ /// <summary>
49
+ /// Maximum tolerated duration of consecutive quiet frames while already speaking.
50
+ /// </summary>
51
+ public int MaxGapMs = 300;
52
+
53
+ /// <summary>
54
+ /// Grace period after speech onset during which release is disallowed.
55
+ /// </summary>
56
+ public int NoDropWindowMs = 400;
57
+
58
+ /// <summary>
59
+ /// Noise-floor update rate (EMA alpha) during non-speech.
60
+ /// </summary>
61
+ public float NonSpeechNoiseUpdateRate = 0.01f;
62
+
63
+ /// <summary>
64
+ /// Maximum noise-floor update rate (EMA alpha) during speech.
65
+ /// </summary>
66
+ public float SpeechNoiseUpdateRate = 0.002f;
67
+
68
+ /// <summary>
69
+ /// The minimum allowed value for the estimated noise level (RMS).
70
+ /// Prevents the noise estimate from collapsing toward zero, which
71
+ /// would make SNR calculations unstable or excessively large.
72
+ /// </summary>
73
+ public float MinNoiseRms = 1e-5f;
74
+
75
+ /// <summary>
76
+ /// Energy floor used to clamp extremely low RMS values.
77
+ /// </summary>
78
+ public float EnergyFloor = 1e-5f;
79
+ }
80
+
81
+ /// <summary>
82
+ /// Raised when the VAD speaking state changes.
83
+ /// The event argument is true when entering the speaking state,
84
+ /// and false when exiting.
85
+ /// </summary>
86
+ public event Action<bool> OnVadChanged;
87
+
88
+ /// <summary>
89
+ /// Current speaking state.
90
+ /// </summary>
91
+ public bool IsSpeaking { get; private set; }
92
+
93
+ private readonly Config _config;
94
+
95
+ // Temporary buffer used to collect samples until one full frame is ready.
96
+ private float[] _frameBuf;
97
+
98
+ // Current fill position within the frame buffer.
99
+ private int _frameFill;
100
+
101
+ // Sample rate currently used for frame geometry.
102
+ private int _curSampleRate = -1;
103
+
104
+ // Cached frame size in samples for the current sample rate.
105
+ private int _frameSamples = 0;
106
+
107
+ // Duration (ms) of a single analysis frame at the current sample rate.
108
+ private float _frameDurationMs = 0f;
109
+
110
+ // Current adaptive noise level estimate (RMS). Updated every frame via EMA.
111
+ private float _noiseRms;
112
+
113
+ // Small constant added to denominators to avoid log(0) or division by zero.
114
+ private readonly float _eps = 1e-12f;
115
+
116
+ // Time (ms) of continuous speech detected so far.
117
+ private float _speechMs;
118
+
119
+ // Time (ms) of continuous silence detected so far.
120
+ private float _silenceMs;
121
+
122
+ // Time (ms) since the most recent transition into the speaking state.
123
+ private float _sinceOnsetMs;
124
+
125
+ // Accumulated quiet period (ms) while still considered speaking.
126
+ private float _gapMs;
127
+
128
+ // Warm-up frames during which we only learn noise and disallow onset.
129
+ private int _warmupFrames = 0;
130
+
131
+ /// <summary>
132
+ /// Creates a detector that starts in the non-speaking state, with the noise estimate seeded at max(MinNoiseRms, 5e-3).
133
+ /// </summary>
134
+ /// <param name="config">Settings to use. When null, a default <see cref="Config"/> is created.</param>
135
+ public SimpleVad(Config config = null) {
136
+ _config = config != null ? config : new Config();
137
+ IsSpeaking = false;
138
+ _noiseRms = Math.Max(5e-3f, _config.MinNoiseRms);
139
+ }
140
+
141
+ /// <summary>
142
+ /// Ensures internal frame geometry matches the provided frequency. Recomputes the
143
+ /// frame sample count, re-arms warm-up and resets streaming timers when it changes.
144
+ /// </summary>
145
+ /// <param name="frequency">Input sample rate in Hz. If not positive, the current rate (or 16000 if none yet) is used.</param>
146
+ private void EnsureGeometry(int frequency) {
147
+ // A non-positive rate would make the frame duration below infinite or negative
148
+ if (frequency <= 0) frequency = _curSampleRate > 0 ? _curSampleRate : 16000;
149
+ if (frequency == _curSampleRate && _frameBuf != null) return;
150
+ _curSampleRate = frequency;
151
+
152
+ // Frame length from the target duration (clamped to >= 1 ms), never under 80 samples
153
+ int frameMs = Math.Max(1, _config.TargetFrameMs);
154
+ int frameSamples = Math.Max(80, (_curSampleRate * frameMs) / 1000);
155
+
156
+ // Re-arm warm-up: ~200 ms of noise learning during which onset is disallowed
157
+ _warmupFrames = Math.Max(1, (int)Math.Ceiling(200.0 / frameMs));
158
+
159
+ if (frameSamples != _frameSamples || _frameBuf == null) {
160
+ _frameSamples = frameSamples;
161
+ _frameBuf = new float[_frameSamples];
162
+ _frameFill = 0;
163
+ }
164
+ _frameDurationMs = 1000f * _frameSamples / (float)_curSampleRate;
165
+
166
+ // When geometry changes, reset streaming timers so old partials don't leak across rates
167
+ _speechMs = 0f;
168
+ _silenceMs = 0f;
169
+ if (IsSpeaking) _sinceOnsetMs = 0f; // restarts the no-drop window while speaking
170
+ _gapMs = 0f;
171
+ _noiseRms = Math.Max(_noiseRms, Math.Max(_config.MinNoiseRms, 5e-3f));
172
+ }
173
+
174
+ /// <summary>
175
+ /// Resets buffered samples, timers and the noise estimate. Does not raise <see cref="OnVadChanged"/>.
176
+ /// </summary>
177
+ /// <param name="isSpeaking">Initial speaking state after reset.</param>
178
+ public void Reset(bool isSpeaking = false) {
179
+ if (_frameBuf != null)
180
+ Array.Clear(_frameBuf, 0, _frameBuf.Length);
181
+ _frameFill = 0;
182
+ _speechMs = 0f;
183
+ _silenceMs = 0f;
184
+ _sinceOnsetMs = 0f;
185
+ _gapMs = 0f;
186
+ _noiseRms = Math.Max(_config.MinNoiseRms, 5e-3f); // same seed as the constructor; honours the configured floor
187
+ IsSpeaking = isSpeaking;
188
+ }
189
+
190
+ /// <summary>
191
+ /// Processes interleaved float PCM in the range [-1, 1] with adaptive
192
+ /// handling of frequency and channels. Multi-channel input is downmixed
193
+ /// to mono via averaging.
194
+ /// </summary>
195
+ /// <param name="frequency">Input sample rate in Hz.</param>
196
+ /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1 (mono).</param>
197
+ /// <param name="samples">Buffer containing interleaved sample data. A null buffer is ignored.</param>
198
+ /// <param name="count">Number of elements from <paramref name="samples"/> to process; clamped to the buffer length.</param>
199
+ public void Process(int frequency, int channels, float[] samples, int count) {
200
+ if (samples == null || count <= 0) return;
201
+ if (channels <= 0) channels = 1;
202
+
203
+ // Never read past the end of the buffer if the caller overstates count
204
+ count = Math.Min(count, samples.Length);
205
+
206
+ // Reconfigure frame geometry if sample rate changed or not initialized
207
+ EnsureGeometry(frequency);
208
+
209
+ // Only whole interleaved groups are consumed; a trailing partial group is ignored
210
+ int usable = (count / channels) * channels;
211
+
212
+ for (int idx = 0; idx < usable; idx += channels) {
213
+ // Downmix one interleaved multi-channel sample to mono
214
+ float sum = 0f;
215
+ for (int c = 0; c < channels; c++) {
216
+ sum += samples[idx + c];
217
+ }
218
+ _frameBuf[_frameFill++] = sum / channels;
219
+
220
+ // A full analysis frame is ready: classify it, then start refilling the buffer
221
+ if (_frameFill == _frameBuf.Length) {
222
+ ProcessOneFrame(_frameBuf);
223
+ _frameFill = 0;
224
+ }
225
+ }
226
+ }
227
+
228
+ /// <summary>
229
+ /// Processes interleaved 16-bit PCM with adaptive handling of frequency and channels.
230
+ /// Multi-channel input is downmixed to mono via averaging.
231
+ /// </summary>
232
+ /// <param name="frequency">Input sample rate in Hz.</param>
233
+ /// <param name="channels">Number of interleaved channels. If 0 or negative, treated as 1.</param>
234
+ /// <param name="samples">Buffer containing interleaved sample data. A null buffer is ignored.</param>
235
+ /// <param name="count">Number of elements from <paramref name="samples"/> to process; clamped to the buffer length.</param>
236
+ public void Process(int frequency, int channels, short[] samples, int count) {
237
+ if (samples == null || count <= 0) return;
238
+ if (channels <= 0) channels = 1;
239
+
240
+ // Never read past the end of the buffer if the caller overstates count
241
+ count = Math.Min(count, samples.Length);
242
+
243
+ EnsureGeometry(frequency);
244
+
245
+ // Only whole interleaved groups are consumed; a trailing partial group is ignored
246
+ int usable = (count / channels) * channels;
247
+
248
+ for (int idx = 0; idx < usable; idx += channels) {
249
+ // Scale each 16-bit value by 1/32768 and accumulate for the mono downmix
250
+ float sum = 0f;
251
+ for (int c = 0; c < channels; c++) {
252
+ sum += samples[idx + c] / 32768f;
253
+ }
254
+ _frameBuf[_frameFill++] = sum / channels;
255
+
256
+ if (_frameFill == _frameBuf.Length) {
257
+ ProcessOneFrame(_frameBuf);
258
+ _frameFill = 0;
259
+ }
260
+ }
261
+ }
262
+
263
+ /// <summary>
264
+ /// Processes a whole float buffer: forwards to the four-argument overload with the array length as count (0 when null).
265
+ /// </summary>
266
+ /// <param name="frequency">Sample rate of the input, in Hz.</param>
267
+ /// <param name="channels">How many channels are interleaved in the buffer.</param>
268
+ /// <param name="samples">Interleaved sample data; every element is processed.</param>
269
+ public void Process(int frequency, int channels, float[] samples)
270
+ => Process(frequency, channels, samples, samples == null ? 0 : samples.Length);
271
+
272
+ /// <summary>
273
+ /// Processes a whole 16-bit buffer: forwards to the four-argument overload with the array length as count (0 when null).
274
+ /// </summary>
275
+ /// <param name="frequency">Sample rate of the input, in Hz.</param>
276
+ /// <param name="channels">How many channels are interleaved in the buffer.</param>
277
+ /// <param name="samples">Interleaved sample data; every element is processed.</param>
278
+ public void Process(int frequency, int channels, short[] samples)
279
+ => Process(frequency, channels, samples, samples == null ? 0 : samples.Length);
280
+
281
+ /// <summary>
282
+ /// Processes a single analysis frame: measures its RMS, updates the noise estimate, advances the timers and raises OnVadChanged when the speaking state flips.
283
+ /// </summary>
284
+ /// <param name="frame">Mono frame of length equal to the current frame size.</param>
285
+ private void ProcessOneFrame(float[] frame) {
286
+ // --- Energy / RMS ---
287
+ double sumSq = 0;
288
+ for (int i = 0; i < frame.Length; i++) {
289
+ float s = frame[i];
290
+ sumSq += (double)s * s;
291
+ }
292
+ float rms = (float)Math.Sqrt(sumSq / frame.Length);
293
+ rms = Math.Max(rms, _config.EnergyFloor);
294
+
295
+ // --- SNR(dB) vs noise floor ---
296
+ float noise = Math.Max(_noiseRms, _config.MinNoiseRms);
297
+ float snrDb = 20f * (float)Math.Log10((rms + _eps) / (noise + _eps));
298
+
299
+ // Pick threshold depending on current state (hysteresis); a frame clamped to EnergyFloor above is never raw speech
300
+ float threshold = IsSpeaking ? _config.SnrExitDb : _config.SnrEnterDb;
301
+ bool rawSpeech = (snrDb >= threshold) && (rms > _config.EnergyFloor);
302
+
303
+ // Noise EMA: slow during speech, faster during non-speech
304
+ float alpha = rawSpeech ? _config.SpeechNoiseUpdateRate : _config.NonSpeechNoiseUpdateRate;
305
+ _noiseRms = (1f - alpha) * _noiseRms + alpha * rms;
306
+ _noiseRms = Math.Max(_noiseRms, _config.MinNoiseRms);
307
+
308
+ // During warm-up: do not allow entering speaking state.
309
+ // The noise estimate keeps adapting through the EMA update above (alpha still follows rawSpeech).
310
+ if (_warmupFrames > 0) {
311
+ _warmupFrames--;
312
+
313
+ // Treat this frame as "effective silence" for timers.
314
+ // (We still did the EMA update above, so noise keeps adapting.)
315
+ _silenceMs += _frameDurationMs;
316
+ _speechMs = 0f;
317
+
318
+ // Do not change speaking state during warm-up.
319
+ return;
320
+ }
321
+
322
+ // --- Gap filling: allow brief quiet while speaking ---
323
+ if (IsSpeaking) {
324
+ if (rawSpeech) _gapMs = 0f;
325
+ else _gapMs += _frameDurationMs;
326
+ }
327
+ else {
328
+ _gapMs = 0f;
329
+ }
330
+
331
+ // Effective speech used for timers/state: raw speech, or a quiet gap of at most MaxGapMs while already speaking
332
+ bool effectiveSpeech = rawSpeech || (IsSpeaking && _gapMs <= _config.MaxGapMs);
333
+
334
+ // --- Time-based hangover & no-drop window ---
335
+ if (effectiveSpeech) {
336
+ _speechMs += _frameDurationMs;
337
+ _silenceMs = 0f;
338
+ }
339
+ else {
340
+ _silenceMs += _frameDurationMs;
341
+ _speechMs = 0f;
342
+ }
343
+
344
+ bool newIsSpeaking = IsSpeaking;
345
+
346
+ // Enter speaking after AttackMs of continuous effectiveSpeech
347
+ if (!IsSpeaking && _speechMs >= _config.AttackMs) {
348
+ newIsSpeaking = true;
349
+ _sinceOnsetMs = 0f; // reset onset timer when we flip on
350
+ _gapMs = 0f;
351
+ }
352
+
353
+ // Update onset timer if speaking (on the onset frame it is zeroed again in the transition block below)
354
+ if (newIsSpeaking) _sinceOnsetMs += _frameDurationMs;
355
+
356
+ // Exit only if:
357
+ // 1) we've accumulated ReleaseMs of effective silence AND
358
+ // 2) we're past the initial NoDropWindow
359
+ if (IsSpeaking && _silenceMs >= _config.ReleaseMs && _sinceOnsetMs >= _config.NoDropWindowMs) {
360
+ newIsSpeaking = false;
361
+ }
362
+
363
+ if (newIsSpeaking != IsSpeaking) {
364
+ IsSpeaking = newIsSpeaking;
365
+ OnVadChanged?.Invoke(IsSpeaking);
366
+ // reset timers appropriately (subscribers above are notified before these resets run)
367
+ if (IsSpeaking) {
368
+ _sinceOnsetMs = 0f;
369
+ _gapMs = 0f;
370
+ }
371
+ else {
372
+ _silenceMs = 0f;
373
+ _speechMs = 0f;
374
+ _gapMs = 0f;
375
+ }
376
+ }
377
+ }
378
+ }
379
+ }
@@ -0,0 +1,11 @@
1
+ fileFormatVersion: 2
2
+ guid: 016b8c73d9808cc49a425fb3d7acabed
3
+ MonoImporter:
4
+ externalObjects: {}
5
+ serializedVersion: 2
6
+ defaultReferences: []
7
+ executionOrder: 0
8
+ icon: {instanceID: 0}
9
+ userData:
10
+ assetBundleName:
11
+ assetBundleVariant:
@@ -0,0 +1,17 @@
1
+ namespace Adrenak.UniVoice.Filters {
2
+ /// <summary>Audio filter that lets frames through only while the wrapped <see cref="SimpleVad"/> reports speech.</summary>
3
+ public class SimpleVadFilter : IAudioFilter {
4
+ private readonly SimpleVad _vad;
5
+ /// <param name="vad">Detector that every frame is fed to. Must not be null.</param>
6
+ public SimpleVadFilter(SimpleVad vad) {
7
+ _vad = vad ?? throw new System.ArgumentNullException(nameof(vad)); // fail here rather than on the first Run call
8
+ }
9
+
10
+ /// <summary>Feeds the frame to the VAD and returns it while speaking; otherwise returns a default frame.</summary>
11
+ public AudioFrame Run(AudioFrame input) {
12
+ if (input.samples == null) return input; // nothing to analyse; avoids handing a null buffer to BytesToFloats
13
+ _vad.Process(input.frequency, input.channelCount, Utils.Bytes.BytesToFloats(input.samples));
14
+ return _vad.IsSpeaking ? input : default;
15
+ }
16
+ }
17
+ }
@@ -0,0 +1,11 @@
1
+ fileFormatVersion: 2
2
+ guid: abdb23219540abe46ba175163b23e00d
3
+ MonoImporter:
4
+ externalObjects: {}
5
+ serializedVersion: 2
6
+ defaultReferences: []
7
+ executionOrder: 0
8
+ icon: {instanceID: 0}
9
+ userData:
10
+ assetBundleName:
11
+ assetBundleVariant:
@@ -15,7 +15,9 @@ namespace Adrenak.UniVoice {
15
15
 
16
16
  public int SegmentRate => 1;
17
17
 
18
+ #pragma warning disable CS0067
18
19
  public event Action<AudioFrame> OnFrameReady;
20
+ #pragma warning restore
19
21
 
20
22
  public void Dispose() { }
21
23
  }
@@ -51,6 +51,8 @@ namespace Adrenak.UniVoice.Samples {
51
51
 
52
52
  [SerializeField] bool useConcentusEncodeAndDecode = true;
53
53
 
54
+ [SerializeField] bool useVad = true;
55
+
54
56
  void Start() {
55
57
  if (HasSetUp) {
56
58
  Debug.unityLogger.Log(LogType.Log, TAG, "UniVoice is already set up. Ignoring...");
@@ -179,6 +181,12 @@ namespace Adrenak.UniVoice.Samples {
179
181
  }
180
182
  #endif
181
183
 
184
+ if (useVad) {
185
+ // We add the VAD filter after RNNoise.
186
+ // This way a lot of the background noise has already been removed, so the VAD is truly trying to detect voice
187
+ ClientSession.InputFilters.Add(new SimpleVadFilter(new SimpleVad()));
188
+ }
189
+
182
190
  if (useConcentusEncodeAndDecode) {
183
191
  // ConcentureEncoder filter to encode captured audio that reduces the audio frame size
184
192
  ClientSession.InputFilters.Add(new ConcentusEncodeFilter());
@@ -38,7 +38,6 @@ RenderSettings:
38
38
  m_ReflectionIntensity: 1
39
39
  m_CustomReflection: {fileID: 0}
40
40
  m_Sun: {fileID: 0}
41
- m_IndirectSpecularColor: {r: 0.37311926, g: 0.38073996, b: 0.35872698, a: 1}
42
41
  m_UseRadianceAmbientProbe: 0
43
42
  --- !u!157 &3
44
43
  LightmapSettings:
@@ -154,6 +153,7 @@ MonoBehaviour:
154
153
  m_EditorClassIdentifier:
155
154
  useRNNoise4UnityIfAvailable: 1
156
155
  useConcentusEncodeAndDecode: 1
156
+ useVad: 1
157
157
  --- !u!4 &810914623
158
158
  Transform:
159
159
  m_ObjectHideFlags: 0
@@ -51,6 +51,8 @@ namespace Adrenak.UniVoice.Samples {
51
51
 
52
52
  [SerializeField] bool useConcentusEncodeAndDecode = true;
53
53
 
54
+ [SerializeField] bool useVad = true;
55
+
54
56
  void Start() {
55
57
  if (HasSetUp) {
56
58
  Debug.unityLogger.Log(LogType.Log, TAG, "UniVoice is already set up. Ignoring...");
@@ -179,6 +181,12 @@ namespace Adrenak.UniVoice.Samples {
179
181
  }
180
182
  #endif
181
183
 
184
+ if (useVad) {
185
+ // We add the VAD filter after RNNoise.
186
+ // This way a lot of the background noise has already been removed, so the VAD is truly trying to detect voice
187
+ ClientSession.InputFilters.Add(new SimpleVadFilter(new SimpleVad()));
188
+ }
189
+
182
190
  if (useConcentusEncodeAndDecode) {
183
191
  // ConcentureEncoder filter to encode captured audio that reduces the audio frame size
184
192
  ClientSession.InputFilters.Add(new ConcentusEncodeFilter());
@@ -38,7 +38,6 @@ RenderSettings:
38
38
  m_ReflectionIntensity: 1
39
39
  m_CustomReflection: {fileID: 0}
40
40
  m_Sun: {fileID: 0}
41
- m_IndirectSpecularColor: {r: 0.44657874, g: 0.49641275, b: 0.5748172, a: 1}
42
41
  m_UseRadianceAmbientProbe: 0
43
42
  --- !u!157 &3
44
43
  LightmapSettings:
@@ -419,6 +418,7 @@ MonoBehaviour:
419
418
  m_EditorClassIdentifier:
420
419
  useRNNoise4UnityIfAvailable: 1
421
420
  useConcentusEncodeAndDecode: 1
421
+ useVad: 1
422
422
  --- !u!114 &1902381431
423
423
  MonoBehaviour:
424
424
  m_ObjectHideFlags: 0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "com.adrenak.univoice",
3
- "version": "4.9.0",
3
+ "version": "4.10.1",
4
4
  "displayName": "Adrenak.UniVoice",
5
5
  "description": "Voice chat/VoIP framework for Unity.",
6
6
  "unity": "2021.2",
@@ -33,7 +33,7 @@
33
33
  ],
34
34
  "dependencies": {
35
35
  "com.adrenak.brw": "1.0.1",
36
- "com.adrenak.unimic": "3.3.0",
36
+ "com.adrenak.unimic": "3.4.0",
37
37
  "com.adrenak.concentus-unity": "1.0.1"
38
38
  }
39
39
  }