osense-vad 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +21 -0
- package/README.md +403 -0
- package/dist/common/asset-path.d.ts +2 -0
- package/dist/common/asset-path.d.ts.map +1 -0
- package/dist/common/asset-path.js +17 -0
- package/dist/common/asset-path.js.map +1 -0
- package/dist/common/default-model-fetcher.d.ts +2 -0
- package/dist/common/default-model-fetcher.d.ts.map +1 -0
- package/dist/common/default-model-fetcher.js +8 -0
- package/dist/common/default-model-fetcher.js.map +1 -0
- package/dist/common/frame-processor.d.ts +86 -0
- package/dist/common/frame-processor.d.ts.map +1 -0
- package/dist/common/frame-processor.js +180 -0
- package/dist/common/frame-processor.js.map +1 -0
- package/dist/common/index.d.ts +13 -0
- package/dist/common/index.d.ts.map +1 -0
- package/dist/common/index.js +53 -0
- package/dist/common/index.js.map +1 -0
- package/dist/common/logging.d.ts +8 -0
- package/dist/common/logging.d.ts.map +1 -0
- package/dist/common/logging.js +16 -0
- package/dist/common/logging.js.map +1 -0
- package/dist/common/messages.d.ts +10 -0
- package/dist/common/messages.d.ts.map +1 -0
- package/dist/common/messages.js +14 -0
- package/dist/common/messages.js.map +1 -0
- package/dist/common/models/common.d.ts +14 -0
- package/dist/common/models/common.d.ts.map +1 -0
- package/dist/common/models/common.js +3 -0
- package/dist/common/models/common.js.map +1 -0
- package/dist/common/models/index.d.ts +9 -0
- package/dist/common/models/index.d.ts.map +1 -0
- package/dist/common/models/index.js +11 -0
- package/dist/common/models/index.js.map +1 -0
- package/dist/common/models/legacy.d.ts +14 -0
- package/dist/common/models/legacy.d.ts.map +1 -0
- package/dist/common/models/legacy.js +52 -0
- package/dist/common/models/legacy.js.map +1 -0
- package/dist/common/models/v5.d.ts +13 -0
- package/dist/common/models/v5.d.ts.map +1 -0
- package/dist/common/models/v5.js +45 -0
- package/dist/common/models/v5.js.map +1 -0
- package/dist/common/models/v6.d.ts +22 -0
- package/dist/common/models/v6.d.ts.map +1 -0
- package/dist/common/models/v6.js +54 -0
- package/dist/common/models/v6.js.map +1 -0
- package/dist/common/non-real-time-vad.d.ts +23 -0
- package/dist/common/non-real-time-vad.d.ts.map +1 -0
- package/dist/common/non-real-time-vad.js +127 -0
- package/dist/common/non-real-time-vad.js.map +1 -0
- package/dist/common/resampler.d.ts +16 -0
- package/dist/common/resampler.d.ts.map +1 -0
- package/dist/common/resampler.js +63 -0
- package/dist/common/resampler.js.map +1 -0
- package/dist/common/utils.d.ts +8 -0
- package/dist/common/utils.d.ts.map +1 -0
- package/dist/common/utils.js +118 -0
- package/dist/common/utils.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +74 -0
- package/dist/index.js.map +1 -0
- package/dist/real-time-vad.d.ts +63 -0
- package/dist/real-time-vad.d.ts.map +1 -0
- package/dist/real-time-vad.js +171 -0
- package/dist/real-time-vad.js.map +1 -0
- package/dist/silero_vad_legacy.onnx +0 -0
- package/dist/silero_vad_v5.onnx +0 -0
- package/package.json +66 -0
- package/silero_vad_legacy.onnx +0 -0
- package/silero_vad_v5.onnx +0 -0
- package/silero_vad_v6.onnx +0 -0
package/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Agent Voice Response
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
# Agent Voice Response - AVR VAD - Silero Voice Activity Detection for Node.js
|
|
2
|
+
|
|
3
|
+
[Discord](https://discord.gg/DFTU69Hg74)
|
|
4
|
+
[GitHub Repo](https://github.com/agentvoiceresponse/avr-vad)
|
|
5
|
+
[npm version](https://www.npmjs.com/package/avr-vad)
|
|
6
|
+
[npm downloads](https://www.npmjs.com/package/avr-vad)
|
|
7
|
+
[Ko-fi](https://ko-fi.com/agentvoiceresponse)
|
|
8
|
+
|
|
9
|
+
🎤 A Node.js library for Voice Activity Detection using the Silero VAD model.
|
|
10
|
+
|
|
11
|
+
## ✨ Features
|
|
12
|
+
|
|
13
|
+
- 🚀 **Based on Silero VAD**: Uses the pre-trained Silero ONNX model (v5 and legacy versions) for accurate results
|
|
14
|
+
- 🎯 **Real-time processing**: Supports real-time frame-by-frame processing
|
|
15
|
+
- ⚡ **Non-real-time processing**: Batch processing for audio files and streams
|
|
16
|
+
- 🔧 **Configurable**: Customizable thresholds and parameters for different needs
|
|
17
|
+
- 🎵 **Audio processing**: Includes utilities for resampling and audio manipulation
|
|
18
|
+
- 📊 **Multiple models**: Support for both Silero VAD v5 and legacy models
|
|
19
|
+
- 💾 **Bundled models**: Models are included in the package, no external downloads required
|
|
20
|
+
- 📝 **TypeScript**: Fully typed with TypeScript
|
|
21
|
+
|
|
22
|
+
## 🚀 Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
npm install avr-vad
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## 📖 Quick Start
|
|
29
|
+
|
|
30
|
+
### Real-time Processing
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
import { RealTimeVAD } from 'avr-vad';
|
|
34
|
+
|
|
35
|
+
// Initialize the VAD with default options (Silero v5 model)
|
|
36
|
+
const vad = await RealTimeVAD.new({
|
|
37
|
+
model: 'v5', // or 'legacy'
|
|
38
|
+
positiveSpeechThreshold: 0.5,
|
|
39
|
+
negativeSpeechThreshold: 0.35,
|
|
40
|
+
preSpeechPadFrames: 1,
|
|
41
|
+
redemptionFrames: 8,
|
|
42
|
+
frameSamples: 1536,
|
|
43
|
+
minSpeechFrames: 3
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
// Process audio frames in real-time
|
|
47
|
+
const audioFrame = getAudioFrameFromMicrophone(); // Float32Array of 1536 samples at 16kHz
|
|
48
|
+
const result = await vad.processFrame(audioFrame);
|
|
49
|
+
|
|
50
|
+
console.log(`Speech probability: ${result.probability}`);
|
|
51
|
+
console.log(`Speech detected: ${result.msg === 'SPEECH_START' || result.msg === 'SPEECH_CONTINUE'}`);
|
|
52
|
+
|
|
53
|
+
// Clean up when done
|
|
54
|
+
vad.destroy();
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Non-Real-time Processing
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
import { NonRealTimeVAD } from 'avr-vad';
|
|
61
|
+
|
|
62
|
+
// Initialize for batch processing
|
|
63
|
+
const vad = await NonRealTimeVAD.new({
|
|
64
|
+
model: 'v5',
|
|
65
|
+
positiveSpeechThreshold: 0.5,
|
|
66
|
+
negativeSpeechThreshold: 0.35
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
// Process entire audio buffer
|
|
70
|
+
const audioData = loadAudioData(); // Float32Array at 16kHz
|
|
71
|
+
const results = await vad.processAudio(audioData);
|
|
72
|
+
|
|
73
|
+
// Get speech segments
|
|
74
|
+
const speechSegments = vad.getSpeechSegments(results);
|
|
75
|
+
console.log(`Found ${speechSegments.length} speech segments`);
|
|
76
|
+
|
|
77
|
+
speechSegments.forEach((segment, i) => {
|
|
78
|
+
console.log(`Segment ${i + 1}: ${segment.start}ms - ${segment.end}ms`);
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
// Clean up
|
|
82
|
+
vad.destroy();
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## ⚙️ Configuration
|
|
86
|
+
|
|
87
|
+
### Real-time VAD Options
|
|
88
|
+
|
|
89
|
+
```typescript
|
|
90
|
+
interface RealTimeVADOptions {
|
|
91
|
+
/** Model version to use ('v5' | 'legacy') */
|
|
92
|
+
model?: 'v5' | 'legacy';
|
|
93
|
+
|
|
94
|
+
/** Threshold for detecting speech start */
|
|
95
|
+
positiveSpeechThreshold?: number;
|
|
96
|
+
|
|
97
|
+
/** Threshold for detecting speech end */
|
|
98
|
+
negativeSpeechThreshold?: number;
|
|
99
|
+
|
|
100
|
+
/** Frames to include before speech detection */
|
|
101
|
+
preSpeechPadFrames?: number;
|
|
102
|
+
|
|
103
|
+
/** Frames to wait before ending speech */
|
|
104
|
+
redemptionFrames?: number;
|
|
105
|
+
|
|
106
|
+
/** Number of samples per frame (usually 1536 for 16kHz) */
|
|
107
|
+
frameSamples?: number;
|
|
108
|
+
|
|
109
|
+
/** Minimum frames for valid speech */
|
|
110
|
+
minSpeechFrames?: number;
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Non-Real-time VAD Options
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
interface NonRealTimeVADOptions {
|
|
118
|
+
/** Model version to use ('v5' | 'legacy') */
|
|
119
|
+
model?: 'v5' | 'legacy';
|
|
120
|
+
|
|
121
|
+
/** Threshold for detecting speech start */
|
|
122
|
+
positiveSpeechThreshold?: number;
|
|
123
|
+
|
|
124
|
+
/** Threshold for detecting speech end */
|
|
125
|
+
negativeSpeechThreshold?: number;
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Default Values
|
|
130
|
+
|
|
131
|
+
```typescript
|
|
132
|
+
// Real-time VAD defaults
|
|
133
|
+
const defaultRealTimeOptions = {
|
|
134
|
+
model: 'v5',
|
|
135
|
+
positiveSpeechThreshold: 0.5,
|
|
136
|
+
negativeSpeechThreshold: 0.35,
|
|
137
|
+
preSpeechPadFrames: 1,
|
|
138
|
+
redemptionFrames: 8,
|
|
139
|
+
frameSamples: 1536,
|
|
140
|
+
minSpeechFrames: 3
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
// Non-real-time VAD defaults
|
|
144
|
+
const defaultNonRealTimeOptions = {
|
|
145
|
+
model: 'v5',
|
|
146
|
+
positiveSpeechThreshold: 0.5,
|
|
147
|
+
negativeSpeechThreshold: 0.35
|
|
148
|
+
};
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## 📊 Results and Messages
|
|
152
|
+
|
|
153
|
+
### VAD Messages
|
|
154
|
+
|
|
155
|
+
The VAD returns different message types to indicate speech state changes:
|
|
156
|
+
|
|
157
|
+
```typescript
|
|
158
|
+
enum Message {
|
|
159
|
+
ERROR = 'ERROR',
|
|
160
|
+
SPEECH_START = 'SPEECH_START',
|
|
161
|
+
SPEECH_CONTINUE = 'SPEECH_CONTINUE',
|
|
162
|
+
SPEECH_END = 'SPEECH_END',
|
|
163
|
+
SILENCE = 'SILENCE'
|
|
164
|
+
}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Processing Results
|
|
168
|
+
|
|
169
|
+
```typescript
|
|
170
|
+
interface VADResult {
|
|
171
|
+
/** Speech probability (0.0 - 1.0) */
|
|
172
|
+
probability: number;
|
|
173
|
+
|
|
174
|
+
/** Message indicating speech state */
|
|
175
|
+
msg: Message;
|
|
176
|
+
|
|
177
|
+
/** Audio data if speech segment ended */
|
|
178
|
+
audio?: Float32Array;
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Speech Segments
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
interface SpeechSegment {
|
|
186
|
+
/** Start time in milliseconds */
|
|
187
|
+
start: number;
|
|
188
|
+
|
|
189
|
+
/** End time in milliseconds */
|
|
190
|
+
end: number;
|
|
191
|
+
|
|
192
|
+
/** Speech probability for this segment */
|
|
193
|
+
probability: number;
|
|
194
|
+
}
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## 🔧 Audio Utilities
|
|
198
|
+
|
|
199
|
+
The library includes various audio processing utilities:
|
|
200
|
+
|
|
201
|
+
```typescript
|
|
202
|
+
import { utils, Resampler } from 'avr-vad';
|
|
203
|
+
|
|
204
|
+
// Resample audio to 16kHz (required for VAD)
|
|
205
|
+
const resampler = new Resampler({
|
|
206
|
+
nativeSampleRate: 44100,
|
|
207
|
+
targetSampleRate: 16000,
|
|
208
|
+
targetFrameSize: 1536
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
const resampledFrame = resampler.process(audioFrame);
|
|
212
|
+
|
|
213
|
+
// Other utilities
|
|
214
|
+
const frameSize = utils.frameSize; // Get frame size for current sample rate
|
|
215
|
+
const audioBuffer = utils.concatArrays([frame1, frame2]); // Concatenate audio arrays
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## 🎯 Advanced Examples
|
|
219
|
+
|
|
220
|
+
### Real-time Speech Detection with Callbacks
|
|
221
|
+
|
|
222
|
+
```typescript
|
|
223
|
+
import { RealTimeVAD, Message } from 'avr-vad';
|
|
224
|
+
|
|
225
|
+
class SpeechDetector {
|
|
226
|
+
private vad: RealTimeVAD;
|
|
227
|
+
private onSpeechStart?: (audio: Float32Array) => void;
|
|
228
|
+
private onSpeechEnd?: (audio: Float32Array) => void;
|
|
229
|
+
|
|
230
|
+
constructor(callbacks: {
|
|
231
|
+
onSpeechStart?: (audio: Float32Array) => void;
|
|
232
|
+
onSpeechEnd?: (audio: Float32Array) => void;
|
|
233
|
+
}) {
|
|
234
|
+
this.onSpeechStart = callbacks.onSpeechStart;
|
|
235
|
+
this.onSpeechEnd = callbacks.onSpeechEnd;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
async initialize() {
|
|
239
|
+
this.vad = await RealTimeVAD.new({
|
|
240
|
+
positiveSpeechThreshold: 0.5,
|
|
241
|
+
negativeSpeechThreshold: 0.35,
|
|
242
|
+
onSpeechStart: this.onSpeechStart,
|
|
243
|
+
onSpeechEnd: this.onSpeechEnd
|
|
244
|
+
});
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
async processFrame(audioFrame: Float32Array) {
|
|
248
|
+
const result = await this.vad.processFrame(audioFrame);
|
|
249
|
+
return result;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
destroy() {
|
|
253
|
+
this.vad?.destroy();
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Usage
|
|
258
|
+
const detector = new SpeechDetector({
|
|
259
|
+
onSpeechStart: (audio) => console.log(`Speech started with ${audio.length} samples`),
|
|
260
|
+
onSpeechEnd: (audio) => console.log(`Speech ended with ${audio.length} samples`)
|
|
261
|
+
});
|
|
262
|
+
|
|
263
|
+
await detector.initialize();
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Batch Processing Audio File
|
|
267
|
+
|
|
268
|
+
```typescript
|
|
269
|
+
import { NonRealTimeVAD, utils } from 'avr-vad';
|
|
270
|
+
import * as fs from 'fs';
|
|
271
|
+
|
|
272
|
+
async function processAudioFile(filePath: string) {
|
|
273
|
+
// Load audio data (you'll need your own audio loading logic)
|
|
274
|
+
const audioData = loadWavFile(filePath); // Float32Array at 16kHz
|
|
275
|
+
|
|
276
|
+
const vad = await NonRealTimeVAD.new({
|
|
277
|
+
model: 'v5',
|
|
278
|
+
positiveSpeechThreshold: 0.6,
|
|
279
|
+
negativeSpeechThreshold: 0.4
|
|
280
|
+
});
|
|
281
|
+
|
|
282
|
+
const results = await vad.processAudio(audioData);
|
|
283
|
+
const segments = vad.getSpeechSegments(results);
|
|
284
|
+
|
|
285
|
+
console.log(`Processing ${filePath}:`);
|
|
286
|
+
console.log(`Total audio duration: ${(audioData.length / 16000).toFixed(2)}s`);
|
|
287
|
+
console.log(`Speech segments found: ${segments.length}`);
|
|
288
|
+
|
|
289
|
+
segments.forEach((segment, i) => {
|
|
290
|
+
const duration = ((segment.end - segment.start) / 1000).toFixed(2);
|
|
291
|
+
console.log(` Segment ${i + 1}: ${segment.start}ms - ${segment.end}ms (${duration}s)`);
|
|
292
|
+
});
|
|
293
|
+
|
|
294
|
+
vad.destroy();
|
|
295
|
+
return segments;
|
|
296
|
+
}
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
## 📝 Development
|
|
300
|
+
|
|
301
|
+
### Requirements
|
|
302
|
+
|
|
303
|
+
- Node.js >= 16.0.0
|
|
304
|
+
- TypeScript >= 5.0.0
|
|
305
|
+
|
|
306
|
+
### Build
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
npm run build
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Test
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
npm test
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### Scripts
|
|
319
|
+
|
|
320
|
+
```bash
|
|
321
|
+
npm run lint # Run ESLint
|
|
322
|
+
npm run clean # Clean build directory
|
|
323
|
+
npm run prepare # Build before npm install (automatically run)
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## 📁 Project Structure
|
|
327
|
+
|
|
328
|
+
```
|
|
329
|
+
avr-vad/
|
|
330
|
+
├── src/
|
|
331
|
+
│ ├── index.ts # Main exports
|
|
332
|
+
│ ├── real-time-vad.ts # Real-time VAD implementation
|
|
333
|
+
│ └── common/
|
|
334
|
+
│ ├── index.ts # Common exports
|
|
335
|
+
│ ├── frame-processor.ts # Core ONNX processing
|
|
336
|
+
│ ├── non-real-time-vad.ts # Batch processing VAD
|
|
337
|
+
│ ├── utils.ts # Utility functions
|
|
338
|
+
│ ├── resampler.ts # Audio resampling
|
|
339
|
+
├── dist/ # Compiled JavaScript
|
|
340
|
+
├── test/ # Test files
|
|
341
|
+
├── silero_vad_v5.onnx # Silero VAD v5 model
|
|
342
|
+
├── silero_vad_legacy.onnx # Silero VAD legacy model
|
|
343
|
+
└── package.json
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
## 🔧 Troubleshooting
|
|
347
|
+
|
|
348
|
+
### Audio Format Requirements
|
|
349
|
+
|
|
350
|
+
The Silero VAD model requires:
|
|
351
|
+
- **Sample rate**: 16kHz
|
|
352
|
+
- **Channels**: Mono (single channel)
|
|
353
|
+
- **Format**: Float32Array with values between -1.0 and 1.0
|
|
354
|
+
- **Frame size**: 1536 samples (96ms at 16kHz)
|
|
355
|
+
|
|
356
|
+
### Model Selection
|
|
357
|
+
|
|
358
|
+
- **v5 model**: Latest version with improved accuracy
|
|
359
|
+
- **legacy model**: Original model for compatibility
|
|
360
|
+
|
|
361
|
+
Use the `Resampler` utility to convert audio to the required format:
|
|
362
|
+
|
|
363
|
+
```typescript
|
|
364
|
+
import { Resampler } from 'avr-vad';
|
|
365
|
+
|
|
366
|
+
const resampler = new Resampler({
|
|
367
|
+
nativeSampleRate: 44100, // Your audio sample rate
|
|
368
|
+
targetSampleRate: 16000, // Required by VAD
|
|
369
|
+
targetFrameSize: 1536 // Required frame size
|
|
370
|
+
});
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Performance Tips
|
|
374
|
+
|
|
375
|
+
- Use appropriate thresholds for your use case
|
|
376
|
+
- Consider using the legacy model for lower resource usage
|
|
377
|
+
- For real-time applications, ensure your audio processing pipeline can handle 16kHz/1536 samples per frame
|
|
378
|
+
- Use `redemptionFrames` to avoid choppy speech detection
|
|
379
|
+
|
|
380
|
+
## Acknowledgments
|
|
381
|
+
|
|
382
|
+
- [Silero Models](https://github.com/snakers4/silero-vad) for the excellent VAD model
|
|
383
|
+
- [ONNX Runtime](https://onnxruntime.ai/) for model inference
|
|
384
|
+
- The open source community for supporting libraries
|
|
385
|
+
|
|
386
|
+
## Support & Community
|
|
387
|
+
|
|
388
|
+
* **Website:** [https://agentvoiceresponse.com](https://agentvoiceresponse.com) - Official website.
|
|
389
|
+
* **GitHub:** [https://github.com/agentvoiceresponse](https://github.com/agentvoiceresponse) - Report issues, contribute code.
|
|
390
|
+
* **Discord:** [https://discord.gg/DFTU69Hg74](https://discord.gg/DFTU69Hg74) - Join the community discussion.
|
|
391
|
+
* **Docker Hub:** [https://hub.docker.com/u/agentvoiceresponse](https://hub.docker.com/u/agentvoiceresponse) - Find Docker images.
|
|
392
|
+
* **NPM:** [https://www.npmjs.com/~agentvoiceresponse](https://www.npmjs.com/~agentvoiceresponse) - Browse our packages.
|
|
393
|
+
* **Wiki:** [https://wiki.agentvoiceresponse.com/en/home](https://wiki.agentvoiceresponse.com/en/home) - Project documentation and guides.
|
|
394
|
+
|
|
395
|
+
## Support AVR
|
|
396
|
+
|
|
397
|
+
AVR is free and open-source. If you find it valuable, consider supporting its development:
|
|
398
|
+
|
|
399
|
+
<a href="https://ko-fi.com/agentvoiceresponse" target="_blank"><img src="https://ko-fi.com/img/githubbutton_sm.svg" alt="Support us on Ko-fi"></a>
|
|
400
|
+
|
|
401
|
+
## License
|
|
402
|
+
|
|
403
|
+
MIT License - see the [LICENSE.md](LICENSE.md) file for details.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"asset-path.d.ts","sourceRoot":"","sources":["../../src/common/asset-path.ts"],"names":[],"mappings":"AAeA,eAAO,MAAM,aAAa,QAAW,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.baseAssetPath = void 0;
|
|
4
|
+
// nextjs@14 bundler may attempt to execute this during SSR and crash
// Guard: `window` and `window.document` are only touched when both exist, so
// evaluating this module outside a browser leaves basePath at its "/" default.
|
|
5
|
+
const isWeb = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
6
|
+
// The <script> element currently being executed, or null when not in a browser.
const currentScript = isWeb
|
|
7
|
+
? window.document.currentScript
|
|
8
|
+
: null;
|
|
9
|
+
// Fallback base path, used whenever no current script element is available.
let basePath = "/";
|
|
10
|
+
if (currentScript) {
|
|
11
|
+
// Derive the directory of the executing script from its `src`: strip the
// "#fragment", then the "?query", then replace the final "/segment" with "/"
// so the result keeps its trailing slash.
basePath = currentScript.src
|
|
12
|
+
.replace(/#.*$/, "")
|
|
13
|
+
.replace(/\?.*$/, "")
|
|
14
|
+
.replace(/\/[^/]+$/, "/");
|
|
15
|
+
}
|
|
16
|
+
// Exported once at module evaluation time; the value is not recomputed later.
exports.baseAssetPath = basePath;
|
|
17
|
+
//# sourceMappingURL=asset-path.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"asset-path.js","sourceRoot":"","sources":["../../src/common/asset-path.ts"],"names":[],"mappings":";;;AAAA,qEAAqE;AACrE,MAAM,KAAK,GACV,OAAO,MAAM,KAAK,WAAW,IAAI,OAAO,MAAM,CAAC,QAAQ,KAAK,WAAW,CAAC;AACzE,MAAM,aAAa,GAAG,KAAK;IAC1B,CAAC,CAAE,MAAM,CAAC,QAAQ,CAAC,aAAmC;IACtD,CAAC,CAAC,IAAI,CAAC;AAER,IAAI,QAAQ,GAAG,GAAG,CAAC;AACnB,IAAI,aAAa,EAAE,CAAC;IACnB,QAAQ,GAAG,aAAa,CAAC,GAAG;SAC1B,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;SACnB,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;SACpB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;AAC5B,CAAC;AAEY,QAAA,aAAa,GAAG,QAAQ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-model-fetcher.d.ts","sourceRoot":"","sources":["../../src/common/default-model-fetcher.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,mBAAmB,GAAI,MAAM,MAAM,yBAE/C,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.defaultModelFetcher = void 0;
|
|
4
|
+
/**
 * Downloads a model file and resolves with its raw bytes.
 *
 * `fetch` only rejects on network-level failures; an HTTP error status
 * (404, 500, ...) still resolves. Without the `ok` check the body of the
 * error page would be returned as if it were model data, so fail fast with
 * the requested path and the status instead.
 *
 * @param {string} path - URL (or any input accepted by `fetch`) of the model file.
 * @returns {Promise<ArrayBuffer>} The response body as an ArrayBuffer.
 * @throws {Error} If the server answers with a non-2xx status.
 */
const defaultModelFetcher = async (path) => {
    const response = await fetch(path);
    if (!response.ok) {
        throw new Error(`Failed to fetch model from ${path}: ${response.status} ${response.statusText}`);
    }
    return response.arrayBuffer();
};
|
|
7
|
+
exports.defaultModelFetcher = defaultModelFetcher;
|
|
8
|
+
//# sourceMappingURL=default-model-fetcher.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-model-fetcher.js","sourceRoot":"","sources":["../../src/common/default-model-fetcher.ts"],"names":[],"mappings":";;;AAAO,MAAM,mBAAmB,GAAG,CAAC,IAAY,EAAE,EAAE;IAClD,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AAFW,QAAA,mBAAmB,uBAE9B"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { Message } from "./messages";
|
|
2
|
+
import type { SpeechProbabilities } from "./models";
|
|
3
|
+
export interface FrameProcessorOptions {
|
|
4
|
+
/** Threshold over which values returned by the Silero VAD model will be considered as positively indicating speech.
|
|
5
|
+
* The Silero VAD model is run on each frame. This number should be between 0 and 1.
|
|
6
|
+
*/
|
|
7
|
+
positiveSpeechThreshold: number;
|
|
8
|
+
/** Threshold under which values returned by the Silero VAD model will be considered as indicating an absence of speech.
|
|
9
|
+
* Note that the creators of the Silero VAD have historically set this number at 0.15 less than `positiveSpeechThreshold`.
|
|
10
|
+
*/
|
|
11
|
+
negativeSpeechThreshold: number;
|
|
12
|
+
/** After a VAD value under the `negativeSpeechThreshold` is observed, the algorithm will wait `redemptionFrames` frames
|
|
13
|
+
* before running `onSpeechEnd`. If the model returns a value over `positiveSpeechThreshold` during this grace period, then
|
|
14
|
+
* the algorithm will consider the previously-detected "speech end" as having been a false negative.
|
|
15
|
+
*/
|
|
16
|
+
redemptionFrames: number;
|
|
17
|
+
/** Number of audio samples (under a sample rate of 16000) to comprise one "frame" to feed to the Silero VAD model.
|
|
18
|
+
* The `frame` serves as a unit of measurement of lengths of audio segments and many other parameters are defined in terms of
|
|
19
|
+
* frames. The authors of the Silero VAD model offer the following warning:
|
|
20
|
+
* > WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate.
|
|
21
|
+
* > Values other than these may affect model performance!!
|
|
22
|
+
* In this context, audio fed to the VAD model always has sample rate 16000. It is probably a good idea to leave this at 1536.
|
|
23
|
+
*/
|
|
24
|
+
frameSamples: number;
|
|
25
|
+
/** Number of frames to prepend to the audio segment that will be passed to `onSpeechEnd`. */
|
|
26
|
+
preSpeechPadFrames: number;
|
|
27
|
+
/** If an audio segment is detected as a speech segment according to initial algorithm but it has fewer than `minSpeechFrames`,
|
|
28
|
+
* it will be discarded and `onVADMisfire` will be run instead of `onSpeechEnd`.
|
|
29
|
+
*/
|
|
30
|
+
minSpeechFrames: number;
|
|
31
|
+
/**
|
|
32
|
+
* If true, when the user pauses the VAD, it may trigger `onSpeechEnd`.
|
|
33
|
+
*/
|
|
34
|
+
submitUserSpeechOnPause: boolean;
|
|
35
|
+
}
|
|
36
|
+
export declare const defaultLegacyFrameProcessorOptions: FrameProcessorOptions;
|
|
37
|
+
export declare const defaultV5FrameProcessorOptions: FrameProcessorOptions;
|
|
38
|
+
/**
|
|
39
|
+
* Default options for Silero VAD v6.2
|
|
40
|
+
* Optimized for telephony and lower quality audio
|
|
41
|
+
*/
|
|
42
|
+
export declare const defaultV6FrameProcessorOptions: FrameProcessorOptions;
|
|
43
|
+
export declare function validateOptions(options: FrameProcessorOptions): void;
|
|
44
|
+
export interface FrameProcessorInterface {
|
|
45
|
+
resume: () => void;
|
|
46
|
+
process: (arr: Float32Array, handleEvent: (event: FrameProcessorEvent) => any) => Promise<any>;
|
|
47
|
+
endSegment: (handleEvent: (event: FrameProcessorEvent) => any) => {
|
|
48
|
+
msg?: Message;
|
|
49
|
+
audio?: Float32Array;
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
export declare class FrameProcessor implements FrameProcessorInterface {
|
|
53
|
+
modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>;
|
|
54
|
+
modelResetFunc: () => any;
|
|
55
|
+
options: FrameProcessorOptions;
|
|
56
|
+
speaking: boolean;
|
|
57
|
+
audioBuffer: {
|
|
58
|
+
frame: Float32Array;
|
|
59
|
+
isSpeech: boolean;
|
|
60
|
+
}[];
|
|
61
|
+
redemptionCounter: number;
|
|
62
|
+
speechFrameCount: number;
|
|
63
|
+
active: boolean;
|
|
64
|
+
speechRealStartFired: boolean;
|
|
65
|
+
constructor(modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>, modelResetFunc: () => any, options: FrameProcessorOptions);
|
|
66
|
+
reset: () => void;
|
|
67
|
+
pause: (handleEvent: (event: FrameProcessorEvent) => any) => void;
|
|
68
|
+
resume: () => void;
|
|
69
|
+
endSegment: (handleEvent: (event: FrameProcessorEvent) => any) => {};
|
|
70
|
+
process: (frame: Float32Array, handleEvent: (event: FrameProcessorEvent) => any) => Promise<void>;
|
|
71
|
+
}
|
|
72
|
+
export type FrameProcessorEvent = {
|
|
73
|
+
msg: Message.VADMisfire;
|
|
74
|
+
} | {
|
|
75
|
+
msg: Message.SpeechStart;
|
|
76
|
+
} | {
|
|
77
|
+
msg: Message.SpeechRealStart;
|
|
78
|
+
} | {
|
|
79
|
+
msg: Message.SpeechEnd;
|
|
80
|
+
audio: Float32Array;
|
|
81
|
+
} | {
|
|
82
|
+
msg: Message.FrameProcessed;
|
|
83
|
+
probs: SpeechProbabilities;
|
|
84
|
+
frame: Float32Array;
|
|
85
|
+
};
|
|
86
|
+
//# sourceMappingURL=frame-processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"frame-processor.d.ts","sourceRoot":"","sources":["../../src/common/frame-processor.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACrC,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,UAAU,CAAC;AAIpD,MAAM,WAAW,qBAAqB;IACpC;;OAEG;IACH,uBAAuB,EAAE,MAAM,CAAC;IAEhC;;OAEG;IACH,uBAAuB,EAAE,MAAM,CAAC;IAEhC;;;OAGG;IACH,gBAAgB,EAAE,MAAM,CAAC;IAEzB;;;;;;OAMG;IACH,YAAY,EAAE,MAAM,CAAC;IAErB,6FAA6F;IAC7F,kBAAkB,EAAE,MAAM,CAAC;IAE3B;;OAEG;IACH,eAAe,EAAE,MAAM,CAAC;IAExB;;OAEG;IACH,uBAAuB,EAAE,OAAO,CAAC;CAClC;AAED,eAAO,MAAM,kCAAkC,EAAE,qBAQhD,CAAC;AAEF,eAAO,MAAM,8BAA8B,EAAE,qBAQ5C,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,8BAA8B,EAAE,qBAQ5C,CAAC;AAEF,wBAAgB,eAAe,CAAC,OAAO,EAAE,qBAAqB,QAwB7D;AAED,MAAM,WAAW,uBAAuB;IACtC,MAAM,EAAE,MAAM,IAAI,CAAC;IACnB,OAAO,EAAE,CACP,GAAG,EAAE,YAAY,EACjB,WAAW,EAAE,CAAC,KAAK,EAAE,mBAAmB,KAAK,GAAG,KAC7C,OAAO,CAAC,GAAG,CAAC,CAAC;IAClB,UAAU,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,EAAE,mBAAmB,KAAK,GAAG,KAAK;QAChE,GAAG,CAAC,EAAE,OAAO,CAAC;QACd,KAAK,CAAC,EAAE,YAAY,CAAC;KACtB,CAAC;CACH;AAkBD,qBAAa,cAAe,YAAW,uBAAuB;IASnD,gBAAgB,EAAE,CACvB,KAAK,EAAE,YAAY,KAChB,OAAO,CAAC,mBAAmB,CAAC;IAC1B,cAAc,EAAE,MAAM,GAAG;IACzB,OAAO,EAAE,qBAAqB;IAZvC,QAAQ,UAAS;IACjB,WAAW,EAAE;QAAE,KAAK,EAAE,YAAY,CAAC;QAAC,QAAQ,EAAE,OAAO,CAAA;KAAE,EAAE,CAAC;IAC1D,iBAAiB,SAAK;IACtB,gBAAgB,SAAK;IACrB,MAAM,UAAS;IACf,oBAAoB,UAAS;gBAGpB,gBAAgB,EAAE,CACvB,KAAK,EAAE,YAAY,KAChB,OAAO,CAAC,mBAAmB,CAAC,EAC1B,cAAc,EAAE,MAAM,GAAG,EACzB,OAAO,EAAE,qBAAqB;IAMvC,KAAK,aAOH;IAEF,KAAK,GAAI,aAAa,CAAC,KAAK,EAAE,mBAAmB,KAAK,GAAG,UAOvD;IAEF,MAAM,aAEJ;IAEF,UAAU,GAAI,aAAa,CAAC,KAAK,EAAE,mBAAmB,KAAK,GAAG,QAkB5D;IAEF,OAAO,GACL,OAAO,YAAY,EACnB,aAAa,CAAC,KAAK,EAAE,mBAAmB,KAAK,GAAG,mBAiEhD;CACH;AAED,MAAM,MAAM,mBAAmB,GAC3B;IACE,GAAG,EAAE,OAAO,CAAC,UAAU,CAAC;CACzB,GACD;IACE,GAAG,EAAE,OAAO,CAAC,WAAW,CAAC;CAC1B,GACD;IACE,GAAG,EAAE,OAAO,CAAC,eAAe,CAAC;CAC9B,GACD;IACE,GAAG,EAAE,OAAO,CAAC,SAAS,CAAC;IACvB,KAAK,EAAE,YAAY,CAAC;CACrB,GACD;IACE,GAAG,EAAE,OAAO,CAAC,cAAc,CAAC;IAC5B,KAAK,EAAE,mBAAmB,CAAC;IAC3B,KAAK,EAAE,YAAY,CAAC;CAC
rB,CAAC"}
|