@livekit/agents-plugin-silero 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitattributes +5 -0
- package/.turbo/turbo-build.log +4 -0
- package/CHANGELOG.md +12 -0
- package/LICENSE +201 -0
- package/api-extractor.json +20 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/onnx_model.d.ts +13 -0
- package/dist/onnx_model.d.ts.map +1 -0
- package/dist/onnx_model.js +69 -0
- package/dist/onnx_model.js.map +1 -0
- package/dist/silero_vad.onnx +3 -0
- package/dist/vad.d.ts +58 -0
- package/dist/vad.d.ts.map +1 -0
- package/dist/vad.js +240 -0
- package/dist/vad.js.map +1 -0
- package/package.json +29 -0
- package/src/index.ts +4 -0
- package/src/onnx_model.ts +80 -0
- package/src/onnxruntime.d.ts +8 -0
- package/src/silero_vad.onnx +3 -0
- package/src/vad.ts +328 -0
- package/tsconfig.json +15 -0
- package/tsconfig.tsbuildinfo +1 -0
package/dist/vad.js
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { ExpFilter, VADEventType, VADStream as baseStream, VAD as baseVAD, log, mergeFrames, } from '@livekit/agents';
|
|
5
|
+
import { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';
|
|
6
|
+
import { OnnxModel, newInferenceSession } from './onnx_model.js';
|
|
7
|
+
/**
 * Accumulated inference lag, in milliseconds, above which the stream logs an
 * "inference is slower than realtime" warning.
 */
const SLOW_INFERENCE_THRESHOLD = 200;

/**
 * Options applied by `VAD.load()` when the caller does not supply any.
 */
const defaultVADOptions = {
  // Milliseconds of above-threshold audio required before speech is reported
  // (compared against a running total of inference-window durations in ms).
  minSpeechDuration: 50,
  // Milliseconds of below-threshold audio required before speech is reported as ended.
  minSilenceDuration: 250,
  // Amount of audio kept in front of detected speech.
  // NOTE(review): the magnitude suggests milliseconds — confirm the consumer converts to samples accordingly.
  prefixPaddingDuration: 100,
  // Upper bound on buffered speech audio.
  // NOTE(review): the magnitude suggests milliseconds — confirm the consumer converts to samples accordingly.
  maxBufferedSpeech: 60_000,
  // Smoothed probability above which an inference window counts as speech.
  activationThreshold: 0.5,
  // Sample rate (Hz) handed to the ONNX model; input at another rate is resampled to it.
  sampleRate: 16_000,
  // Forwarded to `newInferenceSession` to restrict inference to the CPU execution provider.
  forceCPU: true,
};
|
|
17
|
+
/**
 * Silero voice activity detector.
 *
 * Holds one ONNX inference session and the detection options; every call to
 * {@link VAD.stream} creates an independent stream with its own model wrapper.
 */
export class VAD extends baseVAD {
  #session;
  #opts;

  /**
   * @param session - inference session created by `newInferenceSession`
   * @param opts - complete set of VAD options (see `defaultVADOptions`)
   */
  constructor(session, opts) {
    // NOTE(review): 32 presumably matches the 512-sample window at 16 kHz (32 ms) — confirm against the base class.
    super({ updateInterval: 32 });
    this.#session = session;
    this.#opts = opts;
  }

  /**
   * Load and initialize the Silero VAD model.
   *
   * This method loads the ONNX model and prepares it for inference. Any option that is not
   * provided falls back to its default, so a partial options object is accepted.
   *
   * @remarks
   * This method may take time to load the model into memory.
   * It is recommended to call this method inside your prewarm mechanism.
   *
   * @example
   * ```ts
   * export default defineAgent({
   *   prewarm: async (proc: JobProcess) => {
   *     proc.userData.vad = await VAD.load();
   *   },
   *   entry: async (ctx: JobContext) => {
   *     const vad = ctx.proc.userData.vad! as VAD;
   *     // the rest of your agent logic
   *   },
   * });
   * ```
   *
   * @param opts - VAD options; missing fields are taken from the defaults
   * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
   */
  static async load(opts = defaultVADOptions) {
    // Merge over the defaults so that a caller passing only some fields does not
    // leave the others (e.g. sampleRate, forceCPU) undefined.
    const mergedOpts = { ...defaultVADOptions, ...opts };
    const session = await newInferenceSession(mergedOpts.forceCPU);
    return new VAD(session, mergedOpts);
  }

  /**
   * Create a new detection stream backed by this VAD's session and options.
   *
   * @returns a {@link VADStream} with a fresh `OnnxModel` wrapper
   */
  stream() {
    return new VADStream(this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
  }
}
|
|
59
|
+
/**
 * Streaming voice activity detection.
 *
 * Consumes audio frames from `this.input`, runs the Silero model on fixed-size windows and
 * publishes `INFERENCE_DONE`, `START_OF_SPEECH` and `END_OF_SPEECH` events on `this.queue`.
 * All durations published in events are expressed in milliseconds.
 */
export class VADStream extends baseStream {
  #opts;
  #model;
  #task;
  #expFilter = new ExpFilter(0.35);
  #extraInferenceTime = 0;
  #logger = log();

  /**
   * @param opts - VAD options (durations in milliseconds, see `defaultVADOptions`)
   * @param model - model wrapper exposing `run`, `sampleRate` and `windowSizeSamples`
   */
  constructor(opts, model) {
    super();
    this.#opts = opts;
    this.#model = model;
    // Start the processing loop; the promise settles when the input iterator ends.
    this.#task = this.#run();
  }

  /**
   * Main processing loop: buffers incoming frames, runs inference one window at a time and
   * emits the corresponding events.
   */
  async #run() {
    const windowSizeSamples = this.#model.windowSizeSamples;
    // duration of one inference window, in milliseconds
    const windowDuration = (windowSizeSamples / this.#opts.sampleRate) * 1000;

    // a copy is exposed to the user in START_OF_SPEECH / END_OF_SPEECH
    let speechBuffer = null;
    let speechBufferMaxReached = false;
    let speechBufferIndex = 0;

    // "pub" means public, these values are exposed to the users through events
    let pubSpeaking = false;
    let pubSpeechDuration = 0;
    let pubSilenceDuration = 0;
    let pubCurrentSample = 0;
    let pubTimestamp = 0;
    let pubSampleRate = 0;
    let pubPrefixPaddingSamples = 0; // size in samples of padding data

    let speechThresholdDuration = 0;
    let silenceThresholdDuration = 0;

    let inputFrames = [];
    let inferenceFrames = [];
    let resampler = null;

    // used to avoid drift when the sampleRate ratio is not an integer
    let inputCopyRemainingFrac = 0.0;

    // Snapshot of the speech collected so far, at the input sample rate.
    const copySpeechBuffer = () => {
      if (!speechBuffer) throw new Error('speechBuffer is empty');
      return new AudioFrame(
        new Int16Array(speechBuffer.subarray(0, speechBufferIndex)),
        pubSampleRate,
        1,
        speechBufferIndex,
      );
    };

    for await (const frame of this.input) {
      if (typeof frame === 'symbol') {
        continue; // ignore flush sentinel for now
      }

      if (!pubSampleRate || !speechBuffer) {
        pubSampleRate = frame.sampleRate;
        // option durations are in milliseconds: convert to samples at the input rate
        pubPrefixPaddingSamples = Math.ceil(
          (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,
        );
        speechBuffer = new Int16Array(
          Math.trunc((this.#opts.maxBufferedSpeech * pubSampleRate) / 1000) +
            pubPrefixPaddingSamples,
        );

        if (this.#opts.sampleRate !== pubSampleRate) {
          // resampling needed: the input sample rate isn't the same as the model's
          // sample rate used for inference
          resampler = new AudioResampler(
            pubSampleRate,
            this.#opts.sampleRate,
            1,
            AudioResamplerQuality.QUICK,
          );
        }
      } else if (frame.sampleRate !== pubSampleRate) {
        this.#logger.error('a frame with a different sample rate was already published');
        continue;
      }

      inputFrames.push(frame);
      if (resampler) {
        inferenceFrames.push(...resampler.push(frame));
      } else {
        inferenceFrames.push(frame);
      }

      while (true) {
        const startTime = process.hrtime.bigint();
        const availableInferenceSamples = inferenceFrames.reduce(
          (acc, f) => acc + f.samplesPerChannel,
          0,
        );
        if (availableInferenceSamples < windowSizeSamples) {
          break; // not enough samples to run inference
        }

        const inputFrame = mergeFrames(inputFrames);
        const inferenceFrame = mergeFrames(inferenceFrames);

        // convert data to f32
        const inferenceData = Float32Array.from(
          inferenceFrame.data.subarray(0, windowSizeSamples),
          (x) => x / 32767,
        );
        const rawProbability = await this.#model.run(inferenceData);
        // smooth the raw model output
        const p = this.#expFilter.apply(1, rawProbability);

        pubCurrentSample += windowSizeSamples;
        pubTimestamp += windowDuration;

        // number of input-rate samples that correspond to one inference window
        const resamplingRatio = pubSampleRate / this.#model.sampleRate;
        const toCopy = windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
        const toCopyInt = Math.trunc(toCopy);
        inputCopyRemainingFrac = toCopy - toCopyInt;

        // input-rate audio covered by this window (copied so later merges cannot alias it)
        const inputWindow = new Int16Array(inputFrame.data.subarray(0, toCopyInt));

        // copy the inference window to the speech buffer and advance the write cursor
        const availableSpace = speechBuffer.length - speechBufferIndex;
        const toCopyBuffer = Math.min(inputWindow.length, availableSpace);
        if (toCopyBuffer > 0) {
          speechBuffer.set(inputWindow.subarray(0, toCopyBuffer), speechBufferIndex);
          speechBufferIndex += toCopyBuffer;
        } else if (availableSpace <= 0 && !speechBufferMaxReached) {
          speechBufferMaxReached = true;
          this.#logger.warn(
            'maxBufferedSpeech reached, ignoring further data for the current speech input',
          );
        }

        // wall-clock time spent on this window, in whole milliseconds
        const inferenceDuration = Number(
          (process.hrtime.bigint() - startTime) / BigInt(1000000),
        );
        this.#extraInferenceTime = Math.max(
          0,
          this.#extraInferenceTime + inferenceDuration - windowDuration,
        );
        if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
          this.#logger
            .child({ delay: this.#extraInferenceTime })
            .warn('inference is slower than realtime');
        }

        // speech/silence durations track audio time, not processing time
        if (pubSpeaking) {
          pubSpeechDuration += windowDuration;
        } else {
          pubSilenceDuration += windowDuration;
        }

        this.queue.put({
          type: VADEventType.INFERENCE_DONE,
          samplesIndex: pubCurrentSample,
          timestamp: pubTimestamp,
          silenceDuration: pubSilenceDuration,
          speechDuration: pubSpeechDuration,
          probability: p,
          inferenceDuration,
          frames: [new AudioFrame(inputWindow, pubSampleRate, 1, inputWindow.length)],
          speaking: pubSpeaking,
        });

        if (p > this.#opts.activationThreshold) {
          speechThresholdDuration += windowDuration;
          silenceThresholdDuration = 0;

          if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
            pubSpeaking = true;
            pubSilenceDuration = 0;
            pubSpeechDuration = speechThresholdDuration;

            this.queue.put({
              type: VADEventType.START_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
            });
          }
        } else {
          silenceThresholdDuration += windowDuration;
          speechThresholdDuration = 0;

          // While idle, keep only the most recent prefix-padding samples at the start of
          // the buffer so the next utterance is emitted with its leading context.
          if (!pubSpeaking && speechBufferIndex > pubPrefixPaddingSamples) {
            const paddingData = speechBuffer.subarray(
              speechBufferIndex - pubPrefixPaddingSamples,
              speechBufferIndex,
            );
            speechBuffer.set(paddingData, 0);
            speechBufferIndex = pubPrefixPaddingSamples;
            speechBufferMaxReached = false;
          }

          if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
            pubSpeaking = false;
            pubSpeechDuration = 0;
            pubSilenceDuration = silenceThresholdDuration;

            this.queue.put({
              type: VADEventType.END_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
            });
          }
        }

        // keep whatever was not consumed by this window for the next iteration
        inputFrames = [];
        inferenceFrames = [];

        if (inputFrame.data.length > toCopyInt) {
          const data = new Int16Array(inputFrame.data.subarray(toCopyInt));
          // data holds mono 16-bit samples, so its length is the sample count
          inputFrames.push(new AudioFrame(data, pubSampleRate, 1, data.length));
        }
        if (inferenceFrame.data.length > windowSizeSamples) {
          const data = new Int16Array(inferenceFrame.data.subarray(windowSizeSamples));
          inferenceFrames.push(new AudioFrame(data, this.#opts.sampleRate, 1, data.length));
        }
      }
    }
  }
}
|
|
240
|
+
//# sourceMappingURL=vad.js.map
|
package/dist/vad.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vad.js","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EACL,SAAS,EACT,YAAY,EACZ,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EACd,GAAG,EACH,WAAW,GACZ,MAAM,iBAAiB,CAAC;AACzB,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAGtF,OAAO,EAAE,SAAS,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAEjE,MAAM,wBAAwB,GAAG,GAAG,CAAC,CAAC,gBAAgB;AAmBtD,MAAM,iBAAiB,GAAe;IACpC,iBAAiB,EAAE,EAAE;IACrB,kBAAkB,EAAE,GAAG;IACvB,qBAAqB,EAAE,GAAG;IAC1B,iBAAiB,EAAE,KAAK;IACxB,mBAAmB,EAAE,GAAG;IACxB,UAAU,EAAE,KAAK;IACjB,QAAQ,EAAE,IAAI;CACf,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,OAAO;IAC9B,QAAQ,CAAmB;IAC3B,KAAK,CAAa;IAElB,YAAY,OAAyB,EAAE,IAAgB;QACrD,KAAK,CAAC,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC,CAAC;QAC9B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;IACpB,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;IACH,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,iBAAiB;QACxC,MAAM,OAAO,GAAG,MAAM,mBAAmB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACzD,OAAO,IAAI,GAAG,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IAChC,CAAC;IAED,MAAM;QACJ,OAAO,IAAI,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,SAAS,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC;IACxF,CAAC;CACF;AAED,MAAM,OAAO,SAAU,SAAQ,UAAU;IACvC,KAAK,CAAa;IAClB,MAAM,CAAY;IAClB,KAAK,CAAgB;IACrB,UAAU,GAAG,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC;IACjC,mBAAmB,GAAG,CAAC,CAAC;IACxB,OAAO,GAAG,GAAG,EAAE,CAAC;IAEhB,YAAY,IAAgB,EAAE,KAAgB;QAC5C,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC;QAEpB,IAAI,CAAC,KAAK,GAAG,IAAI,OAAO,CAAC,KAAK,IAAI,EAAE;YAClC,IAAI,aAAa,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC;YAEpE,iDAAiD;YACjD,IAAI,YAAY,GAAsB,IAAI,CAAC;YAC3C,IAAI,sBAAsB,GAAG,KAAK,CAAC;YACnC,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAE1B,2EAA2E;YAC3E,IAAI,WAAW,GAAG,KAAK,CAAC;YACxB,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAC1B,IAAI,kBAAkB,GAAG,CAAC,CAAC;YAC3B,IAAI,gBAAgB,GAAG,CAAC,CAAC;YACzB,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,IAAI,aAAa,GAAG,CAAC,CAAC;YACtB,IAAI,uBAAuB,GAAG,CAAC,CAAC,CAAC,kCAAkC;YAEnE,IAAI,uBAAuB,GAAG
,CAAC,CAAC;YAChC,IAAI,wBAAwB,GAAG,CAAC,CAAC;YAEjC,IAAI,WAAW,GAAG,EAAE,CAAC;YACrB,IAAI,eAAe,GAAiB,EAAE,CAAC;YACvC,IAAI,SAAS,GAA0B,IAAI,CAAC;YAE5C,kEAAkE;YAClE,IAAI,sBAAsB,GAAG,GAAG,CAAC;YAEjC,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBACrC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;oBAC9B,SAAS,CAAC,gCAAgC;gBAC5C,CAAC;gBAED,IAAI,CAAC,aAAa,IAAI,CAAC,YAAY,EAAE,CAAC;oBACpC,aAAa,GAAG,KAAK,CAAC,UAAU,CAAC;oBACjC,uBAAuB,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,qBAAqB,GAAG,aAAa,CAAC,CAAC;oBAEtF,YAAY,GAAG,IAAI,UAAU,CAC3B,CAAC,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,GAAG,aAAa,CAClF,CAAC;oBAEF,IAAI,IAAI,CAAC,KAAK,CAAC,UAAU,KAAK,aAAa,EAAE,CAAC;wBAC5C,yEAAyE;wBACzE,iCAAiC;wBACjC,SAAS,GAAG,IAAI,cAAc,CAC5B,aAAa,EACb,IAAI,CAAC,KAAK,CAAC,UAAU,EACrB,CAAC,EACD,qBAAqB,CAAC,KAAK,CAC5B,CAAC;oBACJ,CAAC;gBACH,CAAC;qBAAM,IAAI,KAAK,CAAC,UAAU,KAAK,aAAa,EAAE,CAAC;oBAC9C,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,4DAA4D,CAAC,CAAC;oBACjF,SAAS;gBACX,CAAC;gBAED,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACxB,IAAI,SAAS,EAAE,CAAC;oBACd,eAAe,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBACjD,CAAC;qBAAM,CAAC;oBACN,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAC9B,CAAC;gBAED,OAAO,IAAI,EAAE,CAAC;oBACZ,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;oBAC1C,MAAM,yBAAyB,GAAG,eAAe;yBAC9C,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,iBAAiB,CAAC;yBAC/B,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;oBAElC,IAAI,yBAAyB,GAAG,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,CAAC;wBAC9D,MAAM,CAAC,sCAAsC;oBAC/C,CAAC;oBAED,MAAM,UAAU,GAAG,WAAW,CAAC,WAAW,CAAC,CAAC;oBAC5C,MAAM,cAAc,GAAG,WAAW,CAAC,eAAe,CAAC,CAAC;oBAEpD,sBAAsB;oBACtB,aAAa,GAAG,YAAY,CAAC,IAAI,CAC/B,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,EAC9D,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,KAAK,CACjB,CAAC;oBAEF,MAAM,CAAC,GAAG,MAAM,IAAI,CAAC,MAAM;yBACxB,GAAG,CAAC,aAAa,CAAC;yBAClB,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;oBAElD,MAAM,cAAc,GAAG,CAA
C,IAAI,CAAC,MAAM,CAAC,iBAAiB,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC;oBACtF,gBAAgB,IAAI,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC;oBAClD,YAAY,IAAI,cAAc,CAAC;oBAC/B,MAAM,eAAe,GAAG,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;oBAC/D,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,iBAAiB,GAAG,eAAe,GAAG,sBAAsB,CAAC;oBACxF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;oBACrC,sBAAsB,GAAG,MAAM,GAAG,SAAS,CAAC;oBAE5C,iDAAiD;oBACjD,MAAM,cAAc,GAAG,YAAY,CAAC,MAAM,GAAG,iBAAiB,CAAC;oBAC/D,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;oBAC7E,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;wBACrB,YAAY,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,YAAY,CAAC,EAAE,iBAAiB,CAAC,CAAC;oBACjF,CAAC;yBAAM,IAAI,CAAC,sBAAsB,EAAE,CAAC;wBACnC,sBAAsB,GAAG,IAAI,CAAC;wBAC9B,IAAI,CAAC,OAAO,CAAC,IAAI,CACf,+EAA+E,CAChF,CAAC;oBACJ,CAAC;oBAED,MAAM,iBAAiB,GAAG,MAAM,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,SAAS,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;oBAC1F,IAAI,CAAC,mBAAmB,GAAG,IAAI,CAAC,GAAG,CACjC,CAAC,EACD,IAAI,CAAC,mBAAmB,GAAG,iBAAiB,GAAG,cAAc,CAC9D,CAAC;oBACF,IAAI,IAAI,CAAC,mBAAmB,GAAG,wBAAwB,EAAE,CAAC;wBACxD,IAAI,CAAC,OAAO;6BACT,KAAK,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,mBAAmB,EAAE,CAAC;6BAC1C,IAAI,CAAC,mCAAmC,CAAC,CAAC;oBAC/C,CAAC;oBAED,IAAI,WAAW,EAAE,CAAC;wBAChB,iBAAiB,IAAI,iBAAiB,CAAC;oBACzC,CAAC;yBAAM,CAAC;wBACN,kBAAkB,IAAI,iBAAiB,CAAC;oBAC1C,CAAC;oBAED,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;wBACb,IAAI,EAAE,YAAY,CAAC,cAAc;wBACjC,YAAY,EAAE,gBAAgB;wBAC9B,SAAS,EAAE,YAAY;wBACvB,eAAe,EAAE,kBAAkB;wBACnC,cAAc,EAAE,iBAAiB;wBACjC,WAAW,EAAE,CAAC;wBACd,iBAAiB;wBACjB,MAAM,EAAE;4BACN,IAAI,UAAU,CACZ,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,EACtD,aAAa,EACb,CAAC,EACD,SAAS,CACV;yBACF;wBACD,QAAQ,EAAE,WAAW;qBACtB,CAAC,CAAC;oBAEH,MAAM,gBAAgB,GAAG,GAAe,EAAE;wBACxC,IAAI,CAAC,YAAY;4BAAE,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;wBAC5D,OAAO,IAAI,UAAU,CACnB,IAAI,UAAU,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,EAC3D,aAAa,EACb,CAAC,EACD,iBAAiB,CAClB,CAAC;oBACJ,CAAC,CAAC;oBAEF,IAAI,CAAC,GA
AG,IAAI,CAAC,KAAK,CAAC,mBAAmB,EAAE,CAAC;wBACvC,uBAAuB,IAAI,cAAc,CAAC;wBAC1C,wBAAwB,GAAG,CAAC,CAAC;wBAC7B,IAAI,CAAC,WAAW,IAAI,uBAAuB,IAAI,IAAI,CAAC,KAAK,CAAC,iBAAiB,EAAE,CAAC;4BAC5E,WAAW,GAAG,IAAI,CAAC;4BACnB,kBAAkB,GAAG,CAAC,CAAC;4BACvB,iBAAiB,GAAG,uBAAuB,CAAC;4BAE5C,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;gCACb,IAAI,EAAE,YAAY,CAAC,eAAe;gCAClC,YAAY,EAAE,gBAAgB;gCAC9B,SAAS,EAAE,YAAY;gCACvB,eAAe,EAAE,kBAAkB;gCACnC,cAAc,EAAE,iBAAiB;gCACjC,WAAW,EAAE,CAAC;gCACd,iBAAiB;gCACjB,MAAM,EAAE,CAAC,gBAAgB,EAAE,CAAC;gCAC5B,QAAQ,EAAE,WAAW;6BACtB,CAAC,CAAC;wBACL,CAAC;oBACH,CAAC;yBAAM,CAAC;wBACN,wBAAwB,IAAI,cAAc,CAAC;wBAC3C,uBAAuB,GAAG,CAAC,CAAC;wBAE5B,IAAI,CAAC,WAAW,IAAI,iBAAiB,IAAI,uBAAuB,EAAE,CAAC;4BACjE,MAAM,WAAW,GAAG,YAAY,CAAC,QAAQ,CACvC,iBAAiB,GAAG,uBAAuB,EAC3C,iBAAiB,CAClB,CAAC;4BACF,YAAY,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;4BACjC,iBAAiB,GAAG,uBAAuB,CAAC;4BAC5C,sBAAsB,GAAG,KAAK,CAAC;wBACjC,CAAC;wBAED,IAAI,WAAW,IAAI,wBAAwB,GAAG,IAAI,CAAC,KAAK,CAAC,kBAAkB,EAAE,CAAC;4BAC5E,WAAW,GAAG,KAAK,CAAC;4BACpB,iBAAiB,GAAG,CAAC,CAAC;4BACtB,kBAAkB,GAAG,wBAAwB,CAAC;4BAE9C,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;gCACb,IAAI,EAAE,YAAY,CAAC,aAAa;gCAChC,YAAY,EAAE,gBAAgB;gCAC9B,SAAS,EAAE,YAAY;gCACvB,eAAe,EAAE,kBAAkB;gCACnC,cAAc,EAAE,iBAAiB;gCACjC,WAAW,EAAE,CAAC;gCACd,iBAAiB;gCACjB,MAAM,EAAE,CAAC,gBAAgB,EAAE,CAAC;gCAC5B,QAAQ,EAAE,WAAW;6BACtB,CAAC,CAAC;wBACL,CAAC;oBACH,CAAC;oBAED,WAAW,GAAG,EAAE,CAAC;oBACjB,eAAe,GAAG,EAAE,CAAC;oBAErB,IAAI,UAAU,CAAC,IAAI,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;wBACvC,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;wBACjE,WAAW,CAAC,IAAI,CAAC,IAAI,UAAU,CAAC,IAAI,EAAE,aAAa,EAAE,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;oBACxF,CAAC;oBACD,IAAI,cAAc,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,CAAC;wBAC/D,MAAM,IAAI,GAAG,IAAI,UAAU,CACzB,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAC5D,CAAC;wBACF,eAAe,CAAC,IAAI,CAClB,IAAI,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAA
C,MAAM,GAAG,CAAC,CAAC,CAAC,CAC5E,CAAC;oBACJ,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
|
package/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@livekit/agents-plugin-silero",
|
|
3
|
+
"version": "0.4.0",
|
|
4
|
+
"description": "Silero voice activity detection LiveKit Node Agents",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"author": "LiveKit",
|
|
8
|
+
"type": "module",
|
|
9
|
+
"devDependencies": {
|
|
10
|
+
"@microsoft/api-extractor": "^7.35.0",
|
|
11
|
+
"@types/ws": "^8.5.10",
|
|
12
|
+
"onnxruntime-common": "^1.19.2",
|
|
13
|
+
"typescript": "^5.0.0"
|
|
14
|
+
},
|
|
15
|
+
"dependencies": {
|
|
16
|
+
"@livekit/rtc-node": "^0.11.1",
|
|
17
|
+
"onnxruntime-node": "^1.19.2",
|
|
18
|
+
"ws": "^8.16.0",
|
|
19
|
+
"@livekit/agents": "0.4.0"
|
|
20
|
+
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "tsc && cp src/*.onnx dist/",
|
|
23
|
+
"clean": "rm -rf dist",
|
|
24
|
+
"clean:build": "pnpm clean && pnpm build",
|
|
25
|
+
"lint": "eslint -f unix \"src/**/*.{ts,js}\"",
|
|
26
|
+
"api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
|
|
27
|
+
"api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
|
|
28
|
+
}
|
|
29
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { fileURLToPath } from 'node:url';
import { InferenceSession, Tensor } from 'onnxruntime-node';
|
|
5
|
+
|
|
6
|
+
/** Sample rates (Hz) for which `OnnxModel` defines a window and context size. */
export type SampleRate = 8000 | 16000;

/**
 * Create an ONNX Runtime inference session for the `silero_vad.onnx` model that ships next
 * to this module.
 *
 * @param forceCPU - when true, only the `cpu` execution provider is requested; otherwise the
 * runtime's default providers are used
 * @returns a promise resolving to the created session
 */
export const newInferenceSession = (forceCPU: boolean) => {
  // `URL.pathname` is not a filesystem path (percent-encoding, leading slash before a
  // Windows drive letter); `fileURLToPath` performs the correct conversion.
  const modelPath = fileURLToPath(new URL('silero_vad.onnx', import.meta.url));

  return InferenceSession.create(modelPath, {
    interOpNumThreads: 1,
    intraOpNumThreads: 1,
    executionMode: 'sequential',
    executionProviders: forceCPU ? [{ name: 'cpu' }] : undefined,
  });
};
|
|
16
|
+
|
|
17
|
+
/**
 * Stateful wrapper around a Silero VAD inference session.
 *
 * Each call to {@link OnnxModel.run} feeds one window of audio, prefixed with the tail of the
 * previous window (the "context"), together with the recurrent state carried over from the
 * previous call.
 */
export class OnnxModel {
  #session: InferenceSession;
  #sampleRate: number;
  #windowSizeSamples: number;
  #contextSize: number;
  #sampleRateNd: BigInt64Array;
  #context: Float32Array;
  #rnnState: Float32Array;
  #inputBuffer: Float32Array;

  /**
   * @param session - inference session for the Silero VAD model
   * @param sampleRate - sample rate of the audio passed to {@link OnnxModel.run}
   * @throws Error if `sampleRate` is neither 8000 nor 16000
   */
  constructor(session: InferenceSession, sampleRate: SampleRate) {
    this.#session = session;
    this.#sampleRate = sampleRate;

    switch (sampleRate) {
      case 8000:
        this.#windowSizeSamples = 256;
        this.#contextSize = 32;
        break;
      case 16000:
        this.#windowSizeSamples = 512;
        this.#contextSize = 64;
        break;
      default:
        // reachable from untyped callers; fail loudly instead of leaving sizes undefined
        throw new Error(`unsupported sample rate: ${String(sampleRate)}`);
    }

    this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
    this.#context = new Float32Array(this.#contextSize);
    this.#rnnState = new Float32Array(2 * 1 * 128);
    this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
  }

  /** Sample rate (Hz) the model was configured with. */
  get sampleRate(): number {
    return this.#sampleRate;
  }

  /** Number of new samples consumed by each call to {@link OnnxModel.run}. */
  get windowSizeSamples(): number {
    return this.#windowSizeSamples;
  }

  /** Number of samples from the previous window that are prepended to each input. */
  get contextSize(): number {
    return this.#contextSize;
  }

  /**
   * Run inference on one window of audio.
   *
   * @param x - window of float samples; expected to hold `windowSizeSamples` values
   * @returns the first value of the model's `output` tensor
   */
  async run(x: Float32Array): Promise<number> {
    this.#inputBuffer.set(this.#context, 0);
    this.#inputBuffer.set(x, this.#contextSize);

    const result = await this.#session.run({
      input: new Tensor('float32', this.#inputBuffer, [
        1,
        this.#contextSize + this.#windowSizeSamples,
      ]),
      state: new Tensor('float32', this.#rnnState, [2, 1, 128]),
      sr: new Tensor('int64', this.#sampleRateNd),
    });

    // Carry the recurrent state into the next call.
    // NOTE(review): `stateN` is the state output name of the Silero VAD v5 graph — confirm
    // against the bundled model; when absent the previous state is kept.
    const nextState = result.stateN;
    if (nextState) {
      this.#rnnState = Float32Array.from(nextState.data as Float32Array);
    }

    // The context for the next call is the LAST contextSize samples of this input. Copy them
    // into the dedicated array so the context does not alias the input buffer.
    this.#context.set(this.#inputBuffer.subarray(this.#windowSizeSamples));

    return (result.output.data as Float32Array).at(0)!;
  }
}
|