@livekit/agents-plugin-silero 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/vad.cjs +90 -32
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +17 -0
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +90 -32
- package/dist/vad.js.map +1 -1
- package/package.json +4 -4
- package/src/vad.ts +102 -34
package/dist/vad.cjs
CHANGED
|
@@ -38,11 +38,27 @@ const defaultVADOptions = {
|
|
|
38
38
|
class VAD extends import_agents.VAD {
|
|
39
39
|
#session;
|
|
40
40
|
#opts;
|
|
41
|
+
#streams;
|
|
41
42
|
label = "silero.VAD";
|
|
42
43
|
constructor(session, opts) {
|
|
43
44
|
super({ updateInterval: 32 });
|
|
44
45
|
this.#session = session;
|
|
45
46
|
this.#opts = opts;
|
|
47
|
+
this.#streams = [];
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Updates the VAD options with new values.
|
|
51
|
+
*
|
|
52
|
+
* @param opts - Partial options object containing the values to update
|
|
53
|
+
* @remarks
|
|
54
|
+
* This method will merge the provided options with existing options and update all active streams.
|
|
55
|
+
* Only the properties specified in opts will be updated, other properties retain their current values.
|
|
56
|
+
*/
|
|
57
|
+
updateOptions(opts) {
|
|
58
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
59
|
+
for (const stream of this.#streams) {
|
|
60
|
+
stream.updateOptions(this.#opts);
|
|
61
|
+
}
|
|
46
62
|
}
|
|
47
63
|
/**
|
|
48
64
|
* Load and initialize the Silero VAD model.
|
|
@@ -76,12 +92,22 @@ class VAD extends import_agents.VAD {
|
|
|
76
92
|
return new VAD(session, mergedOpts);
|
|
77
93
|
}
|
|
78
94
|
stream() {
|
|
79
|
-
|
|
95
|
+
const stream = new VADStream(
|
|
96
|
+
this,
|
|
97
|
+
this.#opts,
|
|
98
|
+
new import_onnx_model.OnnxModel(this.#session, this.#opts.sampleRate)
|
|
99
|
+
);
|
|
100
|
+
this.#streams.push(stream);
|
|
101
|
+
return stream;
|
|
80
102
|
}
|
|
81
103
|
}
|
|
82
104
|
class VADStream extends import_agents.VADStream {
|
|
83
105
|
#opts;
|
|
84
106
|
#model;
|
|
107
|
+
#inputSampleRate;
|
|
108
|
+
#speechBuffer;
|
|
109
|
+
#speechBufferMaxReached;
|
|
110
|
+
#prefixPaddingSamples;
|
|
85
111
|
#task;
|
|
86
112
|
#expFilter = new import_agents.ExpFilter(0.35);
|
|
87
113
|
#extraInferenceTime = 0;
|
|
@@ -90,18 +116,18 @@ class VADStream extends import_agents.VADStream {
|
|
|
90
116
|
super(vad);
|
|
91
117
|
this.#opts = opts;
|
|
92
118
|
this.#model = model;
|
|
119
|
+
this.#inputSampleRate = 0;
|
|
120
|
+
this.#speechBuffer = null;
|
|
121
|
+
this.#speechBufferMaxReached = false;
|
|
122
|
+
this.#prefixPaddingSamples = 0;
|
|
93
123
|
this.#task = new Promise(async () => {
|
|
94
124
|
let inferenceData = new Float32Array(this.#model.windowSizeSamples);
|
|
95
|
-
let speechBuffer = null;
|
|
96
|
-
let speechBufferMaxReached = false;
|
|
97
125
|
let speechBufferIndex = 0;
|
|
98
126
|
let pubSpeaking = false;
|
|
99
127
|
let pubSpeechDuration = 0;
|
|
100
128
|
let pubSilenceDuration = 0;
|
|
101
129
|
let pubCurrentSample = 0;
|
|
102
130
|
let pubTimestamp = 0;
|
|
103
|
-
let pubSampleRate = 0;
|
|
104
|
-
let pubPrefixPaddingSamples = 0;
|
|
105
131
|
let speechThresholdDuration = 0;
|
|
106
132
|
let silenceThresholdDuration = 0;
|
|
107
133
|
let inputFrames = [];
|
|
@@ -112,24 +138,23 @@ class VADStream extends import_agents.VADStream {
|
|
|
112
138
|
if (typeof frame === "symbol") {
|
|
113
139
|
continue;
|
|
114
140
|
}
|
|
115
|
-
if (!pubSampleRate || !speechBuffer) {
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
|
|
119
|
-
);
|
|
120
|
-
speechBuffer = new Int16Array(
|
|
121
|
-
this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples
|
|
141
|
+
if (!this.#inputSampleRate || !this.#speechBuffer) {
|
|
142
|
+
this.#inputSampleRate = frame.sampleRate;
|
|
143
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
144
|
+
this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
|
|
122
145
|
);
|
|
123
|
-
|
|
146
|
+
const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
|
|
147
|
+
this.#speechBuffer = new Int16Array(bufferSize);
|
|
148
|
+
if (this.#opts.sampleRate !== this.#inputSampleRate) {
|
|
124
149
|
resampler = new import_rtc_node.AudioResampler(
|
|
125
|
-
|
|
150
|
+
this.#inputSampleRate,
|
|
126
151
|
this.#opts.sampleRate,
|
|
127
152
|
1,
|
|
128
153
|
import_rtc_node.AudioResamplerQuality.QUICK
|
|
129
154
|
// VAD doesn't need high quality
|
|
130
155
|
);
|
|
131
156
|
}
|
|
132
|
-
} else if (frame.sampleRate !== pubSampleRate) {
|
|
157
|
+
} else if (frame.sampleRate !== this.#inputSampleRate) {
|
|
133
158
|
this.#logger.error("a frame with a different sample rate was already published");
|
|
134
159
|
continue;
|
|
135
160
|
}
|
|
@@ -155,17 +180,17 @@ class VADStream extends import_agents.VADStream {
|
|
|
155
180
|
const windowDuration = this.#model.windowSizeSamples / this.#opts.sampleRate * 1e3;
|
|
156
181
|
pubCurrentSample += this.#model.windowSizeSamples;
|
|
157
182
|
pubTimestamp += windowDuration;
|
|
158
|
-
const resamplingRatio = pubSampleRate / this.#model.sampleRate;
|
|
183
|
+
const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
|
|
159
184
|
const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
|
|
160
185
|
const toCopyInt = Math.trunc(toCopy);
|
|
161
186
|
inputCopyRemainingFrac = toCopy - toCopyInt;
|
|
162
|
-
const availableSpace = speechBuffer.length - speechBufferIndex;
|
|
187
|
+
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
163
188
|
const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
|
|
164
189
|
if (toCopyBuffer > 0) {
|
|
165
|
-
speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
190
|
+
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
166
191
|
speechBufferIndex += toCopyBuffer;
|
|
167
|
-
} else if (!speechBufferMaxReached) {
|
|
168
|
-
speechBufferMaxReached = true;
|
|
192
|
+
} else if (!this.#speechBufferMaxReached) {
|
|
193
|
+
this.#speechBufferMaxReached = true;
|
|
169
194
|
this.#logger.warn(
|
|
170
195
|
"maxBufferedSpeech reached, ignoring further data for the current speech input"
|
|
171
196
|
);
|
|
@@ -192,30 +217,35 @@ class VADStream extends import_agents.VADStream {
|
|
|
192
217
|
probability: p,
|
|
193
218
|
inferenceDuration,
|
|
194
219
|
frames: [
|
|
195
|
-
new import_rtc_node.AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
|
|
220
|
+
new import_rtc_node.AudioFrame(
|
|
221
|
+
inputFrame.data.subarray(0, toCopyInt),
|
|
222
|
+
this.#inputSampleRate,
|
|
223
|
+
1,
|
|
224
|
+
toCopyInt
|
|
225
|
+
)
|
|
196
226
|
],
|
|
197
227
|
speaking: pubSpeaking,
|
|
198
228
|
rawAccumulatedSilence: silenceThresholdDuration,
|
|
199
229
|
rawAccumulatedSpeech: speechThresholdDuration
|
|
200
230
|
});
|
|
201
231
|
const resetWriteCursor = () => {
|
|
202
|
-
if (!speechBuffer) throw new Error("speechBuffer is empty");
|
|
203
|
-
if (speechBufferIndex <= pubPrefixPaddingSamples) {
|
|
232
|
+
if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
|
|
233
|
+
if (speechBufferIndex <= this.#prefixPaddingSamples) {
|
|
204
234
|
return;
|
|
205
235
|
}
|
|
206
|
-
const paddingData = speechBuffer.subarray(
|
|
207
|
-
speechBufferIndex - pubPrefixPaddingSamples,
|
|
236
|
+
const paddingData = this.#speechBuffer.subarray(
|
|
237
|
+
speechBufferIndex - this.#prefixPaddingSamples,
|
|
208
238
|
speechBufferIndex
|
|
209
239
|
);
|
|
210
|
-
speechBuffer.set(paddingData, 0);
|
|
211
|
-
speechBufferIndex = pubPrefixPaddingSamples;
|
|
212
|
-
speechBufferMaxReached = false;
|
|
240
|
+
this.#speechBuffer.set(paddingData, 0);
|
|
241
|
+
speechBufferIndex = this.#prefixPaddingSamples;
|
|
242
|
+
this.#speechBufferMaxReached = false;
|
|
213
243
|
};
|
|
214
244
|
const copySpeechBuffer = () => {
|
|
215
|
-
if (!speechBuffer) throw new Error("speechBuffer is empty");
|
|
245
|
+
if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
|
|
216
246
|
return new import_rtc_node.AudioFrame(
|
|
217
|
-
speechBuffer.subarray(0, speechBufferIndex),
|
|
218
|
-
|
|
247
|
+
this.#speechBuffer.subarray(0, speechBufferIndex),
|
|
248
|
+
this.#inputSampleRate,
|
|
219
249
|
1,
|
|
220
250
|
speechBufferIndex
|
|
221
251
|
);
|
|
@@ -271,7 +301,9 @@ class VADStream extends import_agents.VADStream {
|
|
|
271
301
|
inferenceFrames = [];
|
|
272
302
|
if (inputFrame.data.length > toCopyInt) {
|
|
273
303
|
const data = inputFrame.data.subarray(toCopyInt);
|
|
274
|
-
inputFrames.push(new import_rtc_node.AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
|
|
304
|
+
inputFrames.push(
|
|
305
|
+
new import_rtc_node.AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2))
|
|
306
|
+
);
|
|
275
307
|
}
|
|
276
308
|
if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
|
|
277
309
|
const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
|
|
@@ -283,6 +315,32 @@ class VADStream extends import_agents.VADStream {
|
|
|
283
315
|
}
|
|
284
316
|
});
|
|
285
317
|
}
|
|
318
|
+
/**
|
|
319
|
+
* Update the VAD options
|
|
320
|
+
*
|
|
321
|
+
* @param opts - Partial options object containing the values to update
|
|
322
|
+
* @remarks
|
|
323
|
+
* This method allows you to update the VAD options after the VAD object has been created
|
|
324
|
+
*/
|
|
325
|
+
updateOptions(opts) {
|
|
326
|
+
const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
|
|
327
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
328
|
+
if (this.#inputSampleRate) {
|
|
329
|
+
if (this.#speechBuffer === null) throw new Error("speechBuffer is null");
|
|
330
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
331
|
+
this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
|
|
332
|
+
);
|
|
333
|
+
const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
|
|
334
|
+
const resizedBuffer = new Int16Array(bufferSize);
|
|
335
|
+
resizedBuffer.set(
|
|
336
|
+
this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize))
|
|
337
|
+
);
|
|
338
|
+
this.#speechBuffer = resizedBuffer;
|
|
339
|
+
if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
|
|
340
|
+
this.#speechBufferMaxReached = false;
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
}
|
|
286
344
|
}
|
|
287
345
|
// Annotate the CommonJS export names for ESM import in node:
|
|
288
346
|
0 && (module.exports = {
|
package/dist/vad.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. 
When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBuffer: Int16Array | null = null;\n let speechBufferMaxReached = false;\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let pubSampleRate = 0;\n let pubPrefixPaddingSamples = 0; // size in samples of padding data\n\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let 
inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!pubSampleRate || !speechBuffer) {\n pubSampleRate = frame.sampleRate;\n pubPrefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,\n );\n\n speechBuffer = new Int16Array(\n this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,\n );\n\n if (this.#opts.sampleRate !== pubSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n pubSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== pubSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / 
this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = pubSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!speechBufferMaxReached) {\n speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= 
pubPrefixPaddingSamples) {\n return;\n }\n\n const paddingData = speechBuffer.subarray(\n speechBufferIndex - pubPrefixPaddingSamples,\n speechBufferIndex,\n );\n speechBuffer.set(paddingData, 0);\n speechBufferIndex = pubPrefixPaddingSamples;\n speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n speechBuffer.subarray(0, speechBufferIndex),\n pubSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = 
inputFrame.data.subarray(toCopyInt);\n inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,WAAO,IAAI,UAAU,MAAM,KAAK,OAAO,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU,CAAC;AAAA,EAC5F;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAEd,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,eAAkC;AACtC,UAAI,yBAAyB;AAC7B,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,gBAAgB;AACpB,UAAI,0BAA0B;AAE9B,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,iBAAiB,CAAC,cAAc;AACnC,0BAAgB,MAAM;AACtB,oCAA0B,KAAK;AAAA,YAC5B,KAAK,MAAM,wBAAwB,gBAAiB;AAAA,UACvD;AAEA,yBA
Ae,IAAI;AAAA,YACjB,KAAK,MAAM,oBAAoB,gBAAgB;AAAA,UACjD;AAEA,cAAI,KAAK,MAAM,eAAe,eAAe;AAG3C,wBAAY,IAAI;AAAA,cACd;AAAA,cACA,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,eAAe;AAC7C,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,gBAAgB,KAAK,OAAO;AACpD,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,aAAa,SAAS;AAC7C,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,yBAAa,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AAC7E,iCAAqB;AAAA,UACvB,WAAW,CAAC,wBAAwB;AAClC,qCAAyB;AACzB,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI,2BAAW,WAAW,KAAK,SAAS,GAAG,SAAS,GAAG,eAAe,GAAG,SAAS;AAAA,YACpF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,gBAAI,qBAAqB,yBAAyB;AAChD;AA
AA,YACF;AAEA,kBAAM,cAAc,aAAa;AAAA,cAC/B,oBAAoB;AAAA,cACpB;AAAA,YACF;AACA,yBAAa,IAAI,aAAa,CAAC;AAC/B,gCAAoB;AACpB,qCAAyB;AAAA,UAC3B;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,mBAAO,IAAI;AAAA,cACT,aAAa,SAAS,GAAG,iBAAiB;AAAA,cAC1C;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY,KAAK,IAAI,2BAAW,MAAM,eAAe,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC;AAAA,UACtF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AACF;","names":["baseVAD","baseStream"]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the 
provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = 
model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: 
this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(0, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n 
rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UA
AI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,
KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,GAAG,iBAAiB;AAAA,cAChD,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,2BAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;A
ACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":["baseVAD","baseStream"]}
|
package/dist/vad.d.ts
CHANGED
|
@@ -23,6 +23,15 @@ export declare class VAD extends baseVAD {
|
|
|
23
23
|
#private;
|
|
24
24
|
label: string;
|
|
25
25
|
constructor(session: InferenceSession, opts: VADOptions);
|
|
26
|
+
/**
|
|
27
|
+
* Updates the VAD options with new values.
|
|
28
|
+
*
|
|
29
|
+
* @param opts - Partial options object containing the values to update
|
|
30
|
+
* @remarks
|
|
31
|
+
* This method will merge the provided options with existing options and update all active streams.
|
|
32
|
+
* Only the properties specified in opts will be updated, other properties retain their current values.
|
|
33
|
+
*/
|
|
34
|
+
updateOptions(opts: Partial<VADOptions>): void;
|
|
26
35
|
/**
|
|
27
36
|
* Load and initialize the Silero VAD model.
|
|
28
37
|
*
|
|
@@ -55,5 +64,13 @@ export declare class VAD extends baseVAD {
|
|
|
55
64
|
export declare class VADStream extends baseStream {
|
|
56
65
|
#private;
|
|
57
66
|
constructor(vad: VAD, opts: VADOptions, model: OnnxModel);
|
|
67
|
+
/**
|
|
68
|
+
* Update the VAD options
|
|
69
|
+
*
|
|
70
|
+
* @param opts - Partial options object containing the values to update
|
|
71
|
+
* @remarks
|
|
72
|
+
* This method allows you to update the VAD options after the VAD object has been created
|
|
73
|
+
*/
|
|
74
|
+
updateOptions(opts: Partial<VADOptions>): void;
|
|
58
75
|
}
|
|
59
76
|
//# sourceMappingURL=vad.d.ts.map
|
package/dist/vad.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;
|
|
1
|
+
{"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;IAI9B,KAAK,SAAgB;gBAET,OAAO,EAAE,gBAAgB,EAAE,IAAI,EAAE,UAAU;IAOvD;;;;;;;OAOG;IACH,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI;IAO9C;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;WACU,IAAI,CAAC,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM,GAAG,OAAO,CAAC,GAAG,CAAC;IAM/D,MAAM,IAAI,SAAS;CASpB;AAED,qBAAa,SAAU,SAAQ,UAAU;;gBAY3B,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS;IAwPxD;;;;;;OAMG;IACH,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,UAAU,CAAC;CA2BxC"}
|
package/dist/vad.js
CHANGED
|
@@ -21,11 +21,27 @@ const defaultVADOptions = {
|
|
|
21
21
|
class VAD extends baseVAD {
|
|
22
22
|
#session;
|
|
23
23
|
#opts;
|
|
24
|
+
#streams;
|
|
24
25
|
label = "silero.VAD";
|
|
25
26
|
constructor(session, opts) {
|
|
26
27
|
super({ updateInterval: 32 });
|
|
27
28
|
this.#session = session;
|
|
28
29
|
this.#opts = opts;
|
|
30
|
+
this.#streams = [];
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Updates the VAD options with new values.
|
|
34
|
+
*
|
|
35
|
+
* @param opts - Partial options object containing the values to update
|
|
36
|
+
* @remarks
|
|
37
|
+
* This method will merge the provided options with existing options and update all active streams.
|
|
38
|
+
* Only the properties specified in opts will be updated, other properties retain their current values.
|
|
39
|
+
*/
|
|
40
|
+
updateOptions(opts) {
|
|
41
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
42
|
+
for (const stream of this.#streams) {
|
|
43
|
+
stream.updateOptions(this.#opts);
|
|
44
|
+
}
|
|
29
45
|
}
|
|
30
46
|
/**
|
|
31
47
|
* Load and initialize the Silero VAD model.
|
|
@@ -59,12 +75,22 @@ class VAD extends baseVAD {
|
|
|
59
75
|
return new VAD(session, mergedOpts);
|
|
60
76
|
}
|
|
61
77
|
stream() {
|
|
62
|
-
|
|
78
|
+
const stream = new VADStream(
|
|
79
|
+
this,
|
|
80
|
+
this.#opts,
|
|
81
|
+
new OnnxModel(this.#session, this.#opts.sampleRate)
|
|
82
|
+
);
|
|
83
|
+
this.#streams.push(stream);
|
|
84
|
+
return stream;
|
|
63
85
|
}
|
|
64
86
|
}
|
|
65
87
|
class VADStream extends baseStream {
|
|
66
88
|
#opts;
|
|
67
89
|
#model;
|
|
90
|
+
#inputSampleRate;
|
|
91
|
+
#speechBuffer;
|
|
92
|
+
#speechBufferMaxReached;
|
|
93
|
+
#prefixPaddingSamples;
|
|
68
94
|
#task;
|
|
69
95
|
#expFilter = new ExpFilter(0.35);
|
|
70
96
|
#extraInferenceTime = 0;
|
|
@@ -73,18 +99,18 @@ class VADStream extends baseStream {
|
|
|
73
99
|
super(vad);
|
|
74
100
|
this.#opts = opts;
|
|
75
101
|
this.#model = model;
|
|
102
|
+
this.#inputSampleRate = 0;
|
|
103
|
+
this.#speechBuffer = null;
|
|
104
|
+
this.#speechBufferMaxReached = false;
|
|
105
|
+
this.#prefixPaddingSamples = 0;
|
|
76
106
|
this.#task = new Promise(async () => {
|
|
77
107
|
let inferenceData = new Float32Array(this.#model.windowSizeSamples);
|
|
78
|
-
let speechBuffer = null;
|
|
79
|
-
let speechBufferMaxReached = false;
|
|
80
108
|
let speechBufferIndex = 0;
|
|
81
109
|
let pubSpeaking = false;
|
|
82
110
|
let pubSpeechDuration = 0;
|
|
83
111
|
let pubSilenceDuration = 0;
|
|
84
112
|
let pubCurrentSample = 0;
|
|
85
113
|
let pubTimestamp = 0;
|
|
86
|
-
let pubSampleRate = 0;
|
|
87
|
-
let pubPrefixPaddingSamples = 0;
|
|
88
114
|
let speechThresholdDuration = 0;
|
|
89
115
|
let silenceThresholdDuration = 0;
|
|
90
116
|
let inputFrames = [];
|
|
@@ -95,24 +121,23 @@ class VADStream extends baseStream {
|
|
|
95
121
|
if (typeof frame === "symbol") {
|
|
96
122
|
continue;
|
|
97
123
|
}
|
|
98
|
-
if (!
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
this.#opts.prefixPaddingDuration *
|
|
102
|
-
);
|
|
103
|
-
speechBuffer = new Int16Array(
|
|
104
|
-
this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples
|
|
124
|
+
if (!this.#inputSampleRate || !this.#speechBuffer) {
|
|
125
|
+
this.#inputSampleRate = frame.sampleRate;
|
|
126
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
127
|
+
this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
|
|
105
128
|
);
|
|
106
|
-
|
|
129
|
+
const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
|
|
130
|
+
this.#speechBuffer = new Int16Array(bufferSize);
|
|
131
|
+
if (this.#opts.sampleRate !== this.#inputSampleRate) {
|
|
107
132
|
resampler = new AudioResampler(
|
|
108
|
-
|
|
133
|
+
this.#inputSampleRate,
|
|
109
134
|
this.#opts.sampleRate,
|
|
110
135
|
1,
|
|
111
136
|
AudioResamplerQuality.QUICK
|
|
112
137
|
// VAD doesn't need high quality
|
|
113
138
|
);
|
|
114
139
|
}
|
|
115
|
-
} else if (frame.sampleRate !==
|
|
140
|
+
} else if (frame.sampleRate !== this.#inputSampleRate) {
|
|
116
141
|
this.#logger.error("a frame with a different sample rate was already published");
|
|
117
142
|
continue;
|
|
118
143
|
}
|
|
@@ -138,17 +163,17 @@ class VADStream extends baseStream {
|
|
|
138
163
|
const windowDuration = this.#model.windowSizeSamples / this.#opts.sampleRate * 1e3;
|
|
139
164
|
pubCurrentSample += this.#model.windowSizeSamples;
|
|
140
165
|
pubTimestamp += windowDuration;
|
|
141
|
-
const resamplingRatio =
|
|
166
|
+
const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
|
|
142
167
|
const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
|
|
143
168
|
const toCopyInt = Math.trunc(toCopy);
|
|
144
169
|
inputCopyRemainingFrac = toCopy - toCopyInt;
|
|
145
|
-
const availableSpace = speechBuffer.length - speechBufferIndex;
|
|
170
|
+
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
146
171
|
const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
|
|
147
172
|
if (toCopyBuffer > 0) {
|
|
148
|
-
speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
173
|
+
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
149
174
|
speechBufferIndex += toCopyBuffer;
|
|
150
|
-
} else if (!speechBufferMaxReached) {
|
|
151
|
-
speechBufferMaxReached = true;
|
|
175
|
+
} else if (!this.#speechBufferMaxReached) {
|
|
176
|
+
this.#speechBufferMaxReached = true;
|
|
152
177
|
this.#logger.warn(
|
|
153
178
|
"maxBufferedSpeech reached, ignoring further data for the current speech input"
|
|
154
179
|
);
|
|
@@ -175,30 +200,35 @@ class VADStream extends baseStream {
|
|
|
175
200
|
probability: p,
|
|
176
201
|
inferenceDuration,
|
|
177
202
|
frames: [
|
|
178
|
-
new AudioFrame(
|
|
203
|
+
new AudioFrame(
|
|
204
|
+
inputFrame.data.subarray(0, toCopyInt),
|
|
205
|
+
this.#inputSampleRate,
|
|
206
|
+
1,
|
|
207
|
+
toCopyInt
|
|
208
|
+
)
|
|
179
209
|
],
|
|
180
210
|
speaking: pubSpeaking,
|
|
181
211
|
rawAccumulatedSilence: silenceThresholdDuration,
|
|
182
212
|
rawAccumulatedSpeech: speechThresholdDuration
|
|
183
213
|
});
|
|
184
214
|
const resetWriteCursor = () => {
|
|
185
|
-
if (!speechBuffer) throw new Error("speechBuffer is empty");
|
|
186
|
-
if (speechBufferIndex <=
|
|
215
|
+
if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
|
|
216
|
+
if (speechBufferIndex <= this.#prefixPaddingSamples) {
|
|
187
217
|
return;
|
|
188
218
|
}
|
|
189
|
-
const paddingData = speechBuffer.subarray(
|
|
190
|
-
speechBufferIndex -
|
|
219
|
+
const paddingData = this.#speechBuffer.subarray(
|
|
220
|
+
speechBufferIndex - this.#prefixPaddingSamples,
|
|
191
221
|
speechBufferIndex
|
|
192
222
|
);
|
|
193
|
-
speechBuffer.set(paddingData, 0);
|
|
194
|
-
speechBufferIndex =
|
|
195
|
-
speechBufferMaxReached = false;
|
|
223
|
+
this.#speechBuffer.set(paddingData, 0);
|
|
224
|
+
speechBufferIndex = this.#prefixPaddingSamples;
|
|
225
|
+
this.#speechBufferMaxReached = false;
|
|
196
226
|
};
|
|
197
227
|
const copySpeechBuffer = () => {
|
|
198
|
-
if (!speechBuffer) throw new Error("speechBuffer is empty");
|
|
228
|
+
if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
|
|
199
229
|
return new AudioFrame(
|
|
200
|
-
speechBuffer.subarray(0, speechBufferIndex),
|
|
201
|
-
|
|
230
|
+
this.#speechBuffer.subarray(0, speechBufferIndex),
|
|
231
|
+
this.#inputSampleRate,
|
|
202
232
|
1,
|
|
203
233
|
speechBufferIndex
|
|
204
234
|
);
|
|
@@ -254,7 +284,9 @@ class VADStream extends baseStream {
|
|
|
254
284
|
inferenceFrames = [];
|
|
255
285
|
if (inputFrame.data.length > toCopyInt) {
|
|
256
286
|
const data = inputFrame.data.subarray(toCopyInt);
|
|
257
|
-
inputFrames.push(
|
|
287
|
+
inputFrames.push(
|
|
288
|
+
new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2))
|
|
289
|
+
);
|
|
258
290
|
}
|
|
259
291
|
if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
|
|
260
292
|
const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
|
|
@@ -266,6 +298,32 @@ class VADStream extends baseStream {
|
|
|
266
298
|
}
|
|
267
299
|
});
|
|
268
300
|
}
|
|
301
|
+
/**
|
|
302
|
+
* Update the VAD options
|
|
303
|
+
*
|
|
304
|
+
* @param opts - Partial options object containing the values to update
|
|
305
|
+
* @remarks
|
|
306
|
+
* This method allows you to update the VAD options after the VAD object has been created
|
|
307
|
+
*/
|
|
308
|
+
updateOptions(opts) {
|
|
309
|
+
const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
|
|
310
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
311
|
+
if (this.#inputSampleRate) {
|
|
312
|
+
if (this.#speechBuffer === null) throw new Error("speechBuffer is null");
|
|
313
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
314
|
+
this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
|
|
315
|
+
);
|
|
316
|
+
const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
|
|
317
|
+
const resizedBuffer = new Int16Array(bufferSize);
|
|
318
|
+
resizedBuffer.set(
|
|
319
|
+
this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize))
|
|
320
|
+
);
|
|
321
|
+
this.#speechBuffer = resizedBuffer;
|
|
322
|
+
if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
|
|
323
|
+
this.#speechBufferMaxReached = false;
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
}
|
|
269
327
|
}
|
|
270
328
|
export {
|
|
271
329
|
VAD,
|
package/dist/vad.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. 
When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBuffer: Int16Array | null = null;\n let speechBufferMaxReached = false;\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let pubSampleRate = 0;\n let pubPrefixPaddingSamples = 0; // size in samples of padding data\n\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let 
inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!pubSampleRate || !speechBuffer) {\n pubSampleRate = frame.sampleRate;\n pubPrefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,\n );\n\n speechBuffer = new Int16Array(\n this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,\n );\n\n if (this.#opts.sampleRate !== pubSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n pubSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== pubSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / 
this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = pubSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!speechBufferMaxReached) {\n speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= 
pubPrefixPaddingSamples) {\n return;\n }\n\n const paddingData = speechBuffer.subarray(\n speechBufferIndex - pubPrefixPaddingSamples,\n speechBufferIndex,\n );\n speechBuffer.set(paddingData, 0);\n speechBufferIndex = pubPrefixPaddingSamples;\n speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n speechBuffer.subarray(0, speechBufferIndex),\n pubSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = 
inputFrame.data.subarray(toCopyInt);\n inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,WAAO,IAAI,UAAU,MAAM,KAAK,OAAO,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU,CAAC;AAAA,EAC5F;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAEd,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,eAAkC;AACtC,UAAI,yBAAyB;AAC7B,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,gBAAgB;AACpB,UAAI,0BAA0B;AAE9B,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,iBAAiB,CAAC,cAAc;AACnC,0BAAgB,MAAM;AACtB,oCAA0B,KAAK;AAAA,YAC5B,KAAK,
MAAM,wBAAwB,gBAAiB;AAAA,UACvD;AAEA,yBAAe,IAAI;AAAA,YACjB,KAAK,MAAM,oBAAoB,gBAAgB;AAAA,UACjD;AAEA,cAAI,KAAK,MAAM,eAAe,eAAe;AAG3C,wBAAY,IAAI;AAAA,cACd;AAAA,cACA,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,eAAe;AAC7C,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,gBAAgB,KAAK,OAAO;AACpD,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,aAAa,SAAS;AAC7C,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,yBAAa,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AAC7E,iCAAqB;AAAA,UACvB,WAAW,CAAC,wBAAwB;AAClC,qCAAyB;AACzB,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI,WAAW,WAAW,KAAK,SAAS,GAAG,SAAS,GAAG,eAAe,GAAG,SAAS;AAAA,YACpF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;A
AC1D,gBAAI,qBAAqB,yBAAyB;AAChD;AAAA,YACF;AAEA,kBAAM,cAAc,aAAa;AAAA,cAC/B,oBAAoB;AAAA,cACpB;AAAA,YACF;AACA,yBAAa,IAAI,aAAa,CAAC;AAC/B,gCAAoB;AACpB,qCAAyB;AAAA,UAC3B;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,mBAAO,IAAI;AAAA,cACT,aAAa,SAAS,GAAG,iBAAiB;AAAA,cAC1C;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY,KAAK,IAAI,WAAW,MAAM,eAAe,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC;AAAA,UACtF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the 
provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = 
model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: 
this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(0, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n 
rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGl
E,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAA
M,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,GAAG,iBAAiB;AAAA,cAChD,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,WAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eA
Ae,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-silero",
|
|
3
|
-
"version": "0.5.
|
|
3
|
+
"version": "0.5.4",
|
|
4
4
|
"description": "Silero voice activity detection LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
],
|
|
24
24
|
"devDependencies": {
|
|
25
25
|
"@livekit/agents": "^x",
|
|
26
|
-
"@livekit/rtc-node": "^0.13.
|
|
26
|
+
"@livekit/rtc-node": "^0.13.4",
|
|
27
27
|
"@microsoft/api-extractor": "^7.35.0",
|
|
28
28
|
"@types/ws": "^8.5.10",
|
|
29
29
|
"onnxruntime-common": "^1.19.2",
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
"ws": "^8.16.0"
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
|
-
"@livekit/rtc-node": "^0.13.
|
|
39
|
-
"@livekit/agents": "^0.
|
|
38
|
+
"@livekit/rtc-node": "^0.13.4",
|
|
39
|
+
"@livekit/agents": "^0.7.0x"
|
|
40
40
|
},
|
|
41
41
|
"scripts": {
|
|
42
42
|
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/silero_vad.onnx dist/",
|
package/src/vad.ts
CHANGED
|
@@ -46,12 +46,29 @@ const defaultVADOptions: VADOptions = {
|
|
|
46
46
|
export class VAD extends baseVAD {
|
|
47
47
|
#session: InferenceSession;
|
|
48
48
|
#opts: VADOptions;
|
|
49
|
+
#streams: VADStream[];
|
|
49
50
|
label = 'silero.VAD';
|
|
50
51
|
|
|
51
52
|
constructor(session: InferenceSession, opts: VADOptions) {
|
|
52
53
|
super({ updateInterval: 32 });
|
|
53
54
|
this.#session = session;
|
|
54
55
|
this.#opts = opts;
|
|
56
|
+
this.#streams = [];
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Updates the VAD options with new values.
|
|
61
|
+
*
|
|
62
|
+
* @param opts - Partial options object containing the values to update
|
|
63
|
+
* @remarks
|
|
64
|
+
* This method will merge the provided options with existing options and update all active streams.
|
|
65
|
+
* Only the properties specified in opts will be updated, other properties retain their current values.
|
|
66
|
+
*/
|
|
67
|
+
updateOptions(opts: Partial<VADOptions>): void {
|
|
68
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
69
|
+
for (const stream of this.#streams) {
|
|
70
|
+
stream.updateOptions(this.#opts);
|
|
71
|
+
}
|
|
55
72
|
}
|
|
56
73
|
|
|
57
74
|
/**
|
|
@@ -87,13 +104,23 @@ export class VAD extends baseVAD {
|
|
|
87
104
|
}
|
|
88
105
|
|
|
89
106
|
stream(): VADStream {
|
|
90
|
-
|
|
107
|
+
const stream = new VADStream(
|
|
108
|
+
this,
|
|
109
|
+
this.#opts,
|
|
110
|
+
new OnnxModel(this.#session, this.#opts.sampleRate),
|
|
111
|
+
);
|
|
112
|
+
this.#streams.push(stream);
|
|
113
|
+
return stream;
|
|
91
114
|
}
|
|
92
115
|
}
|
|
93
116
|
|
|
94
117
|
export class VADStream extends baseStream {
|
|
95
118
|
#opts: VADOptions;
|
|
96
119
|
#model: OnnxModel;
|
|
120
|
+
#inputSampleRate: number;
|
|
121
|
+
#speechBuffer: Int16Array | null;
|
|
122
|
+
#speechBufferMaxReached: boolean;
|
|
123
|
+
#prefixPaddingSamples: number;
|
|
97
124
|
#task: Promise<void>;
|
|
98
125
|
#expFilter = new ExpFilter(0.35);
|
|
99
126
|
#extraInferenceTime = 0;
|
|
@@ -103,13 +130,15 @@ export class VADStream extends baseStream {
|
|
|
103
130
|
super(vad);
|
|
104
131
|
this.#opts = opts;
|
|
105
132
|
this.#model = model;
|
|
133
|
+
this.#inputSampleRate = 0;
|
|
134
|
+
this.#speechBuffer = null;
|
|
135
|
+
this.#speechBufferMaxReached = false;
|
|
136
|
+
this.#prefixPaddingSamples = 0;
|
|
106
137
|
|
|
107
138
|
this.#task = new Promise(async () => {
|
|
108
139
|
let inferenceData = new Float32Array(this.#model.windowSizeSamples);
|
|
109
140
|
|
|
110
141
|
// a copy is exposed to the user in END_OF_SPEECH
|
|
111
|
-
let speechBuffer: Int16Array | null = null;
|
|
112
|
-
let speechBufferMaxReached = false;
|
|
113
142
|
let speechBufferIndex = 0;
|
|
114
143
|
|
|
115
144
|
// "pub" means public, these values are exposed to the users through events
|
|
@@ -118,9 +147,6 @@ export class VADStream extends baseStream {
|
|
|
118
147
|
let pubSilenceDuration = 0;
|
|
119
148
|
let pubCurrentSample = 0;
|
|
120
149
|
let pubTimestamp = 0;
|
|
121
|
-
let pubSampleRate = 0;
|
|
122
|
-
let pubPrefixPaddingSamples = 0; // size in samples of padding data
|
|
123
|
-
|
|
124
150
|
let speechThresholdDuration = 0;
|
|
125
151
|
let silenceThresholdDuration = 0;
|
|
126
152
|
|
|
@@ -136,27 +162,27 @@ export class VADStream extends baseStream {
|
|
|
136
162
|
continue; // ignore flush sentinel for now
|
|
137
163
|
}
|
|
138
164
|
|
|
139
|
-
if (!
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
(this.#opts.prefixPaddingDuration *
|
|
143
|
-
);
|
|
144
|
-
|
|
145
|
-
speechBuffer = new Int16Array(
|
|
146
|
-
this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,
|
|
165
|
+
if (!this.#inputSampleRate || !this.#speechBuffer) {
|
|
166
|
+
this.#inputSampleRate = frame.sampleRate;
|
|
167
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
168
|
+
(this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,
|
|
147
169
|
);
|
|
170
|
+
const bufferSize =
|
|
171
|
+
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +
|
|
172
|
+
this.#prefixPaddingSamples;
|
|
173
|
+
this.#speechBuffer = new Int16Array(bufferSize);
|
|
148
174
|
|
|
149
|
-
if (this.#opts.sampleRate !==
|
|
175
|
+
if (this.#opts.sampleRate !== this.#inputSampleRate) {
|
|
150
176
|
// resampling needed: the input sample rate isn't the same as the model's
|
|
151
177
|
// sample rate used for inference
|
|
152
178
|
resampler = new AudioResampler(
|
|
153
|
-
|
|
179
|
+
this.#inputSampleRate,
|
|
154
180
|
this.#opts.sampleRate,
|
|
155
181
|
1,
|
|
156
182
|
AudioResamplerQuality.QUICK, // VAD doesn't need high quality
|
|
157
183
|
);
|
|
158
184
|
}
|
|
159
|
-
} else if (frame.sampleRate !==
|
|
185
|
+
} else if (frame.sampleRate !== this.#inputSampleRate) {
|
|
160
186
|
this.#logger.error('a frame with a different sample rate was already published');
|
|
161
187
|
continue;
|
|
162
188
|
}
|
|
@@ -194,19 +220,19 @@ export class VADStream extends baseStream {
|
|
|
194
220
|
const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;
|
|
195
221
|
pubCurrentSample += this.#model.windowSizeSamples;
|
|
196
222
|
pubTimestamp += windowDuration;
|
|
197
|
-
const resamplingRatio =
|
|
223
|
+
const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
|
|
198
224
|
const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
|
|
199
225
|
const toCopyInt = Math.trunc(toCopy);
|
|
200
226
|
inputCopyRemainingFrac = toCopy - toCopyInt;
|
|
201
227
|
|
|
202
228
|
// copy the inference window to the speech buffer
|
|
203
|
-
const availableSpace = speechBuffer.length - speechBufferIndex;
|
|
229
|
+
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
204
230
|
const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
|
|
205
231
|
if (toCopyBuffer > 0) {
|
|
206
|
-
speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
232
|
+
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
207
233
|
speechBufferIndex += toCopyBuffer;
|
|
208
|
-
} else if (!speechBufferMaxReached) {
|
|
209
|
-
speechBufferMaxReached = true;
|
|
234
|
+
} else if (!this.#speechBufferMaxReached) {
|
|
235
|
+
this.#speechBufferMaxReached = true;
|
|
210
236
|
this.#logger.warn(
|
|
211
237
|
'maxBufferedSpeech reached, ignoring further data for the current speech input',
|
|
212
238
|
);
|
|
@@ -238,7 +264,12 @@ export class VADStream extends baseStream {
|
|
|
238
264
|
probability: p,
|
|
239
265
|
inferenceDuration,
|
|
240
266
|
frames: [
|
|
241
|
-
new AudioFrame(
|
|
267
|
+
new AudioFrame(
|
|
268
|
+
inputFrame.data.subarray(0, toCopyInt),
|
|
269
|
+
this.#inputSampleRate,
|
|
270
|
+
1,
|
|
271
|
+
toCopyInt,
|
|
272
|
+
),
|
|
242
273
|
],
|
|
243
274
|
speaking: pubSpeaking,
|
|
244
275
|
rawAccumulatedSilence: silenceThresholdDuration,
|
|
@@ -246,25 +277,25 @@ export class VADStream extends baseStream {
|
|
|
246
277
|
});
|
|
247
278
|
|
|
248
279
|
const resetWriteCursor = () => {
|
|
249
|
-
if (!speechBuffer) throw new Error('speechBuffer is empty');
|
|
250
|
-
if (speechBufferIndex <=
|
|
280
|
+
if (!this.#speechBuffer) throw new Error('speechBuffer is empty');
|
|
281
|
+
if (speechBufferIndex <= this.#prefixPaddingSamples) {
|
|
251
282
|
return;
|
|
252
283
|
}
|
|
253
284
|
|
|
254
|
-
const paddingData = speechBuffer.subarray(
|
|
255
|
-
speechBufferIndex -
|
|
285
|
+
const paddingData = this.#speechBuffer.subarray(
|
|
286
|
+
speechBufferIndex - this.#prefixPaddingSamples,
|
|
256
287
|
speechBufferIndex,
|
|
257
288
|
);
|
|
258
|
-
speechBuffer.set(paddingData, 0);
|
|
259
|
-
speechBufferIndex =
|
|
260
|
-
speechBufferMaxReached = false;
|
|
289
|
+
this.#speechBuffer.set(paddingData, 0);
|
|
290
|
+
speechBufferIndex = this.#prefixPaddingSamples;
|
|
291
|
+
this.#speechBufferMaxReached = false;
|
|
261
292
|
};
|
|
262
293
|
|
|
263
294
|
const copySpeechBuffer = (): AudioFrame => {
|
|
264
|
-
if (!speechBuffer) throw new Error('speechBuffer is empty');
|
|
295
|
+
if (!this.#speechBuffer) throw new Error('speechBuffer is empty');
|
|
265
296
|
return new AudioFrame(
|
|
266
|
-
speechBuffer.subarray(0, speechBufferIndex),
|
|
267
|
-
|
|
297
|
+
this.#speechBuffer.subarray(0, speechBufferIndex),
|
|
298
|
+
this.#inputSampleRate,
|
|
268
299
|
1,
|
|
269
300
|
speechBufferIndex,
|
|
270
301
|
);
|
|
@@ -328,7 +359,9 @@ export class VADStream extends baseStream {
|
|
|
328
359
|
|
|
329
360
|
if (inputFrame.data.length > toCopyInt) {
|
|
330
361
|
const data = inputFrame.data.subarray(toCopyInt);
|
|
331
|
-
inputFrames.push(
|
|
362
|
+
inputFrames.push(
|
|
363
|
+
new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),
|
|
364
|
+
);
|
|
332
365
|
}
|
|
333
366
|
if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
|
|
334
367
|
const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
|
|
@@ -340,4 +373,39 @@ export class VADStream extends baseStream {
|
|
|
340
373
|
}
|
|
341
374
|
});
|
|
342
375
|
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* Update the VAD options
|
|
379
|
+
*
|
|
380
|
+
* @param opts - Partial options object containing the values to update
|
|
381
|
+
* @remarks
|
|
382
|
+
* This method allows you to update the VAD options after the VAD object has been created
|
|
383
|
+
*/
|
|
384
|
+
updateOptions(opts: Partial<VADOptions>) {
|
|
385
|
+
const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
|
|
386
|
+
this.#opts = { ...this.#opts, ...opts };
|
|
387
|
+
|
|
388
|
+
if (this.#inputSampleRate) {
|
|
389
|
+
// Assert speech buffer exists
|
|
390
|
+
if (this.#speechBuffer === null) throw new Error('speechBuffer is null');
|
|
391
|
+
|
|
392
|
+
// Resize speech buffer
|
|
393
|
+
this.#prefixPaddingSamples = Math.trunc(
|
|
394
|
+
(this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,
|
|
395
|
+
);
|
|
396
|
+
const bufferSize =
|
|
397
|
+
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +
|
|
398
|
+
this.#prefixPaddingSamples;
|
|
399
|
+
const resizedBuffer = new Int16Array(bufferSize);
|
|
400
|
+
resizedBuffer.set(
|
|
401
|
+
this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),
|
|
402
|
+
);
|
|
403
|
+
this.#speechBuffer = resizedBuffer;
|
|
404
|
+
|
|
405
|
+
// Determine if max has been reached
|
|
406
|
+
if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
|
|
407
|
+
this.#speechBufferMaxReached = false;
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
}
|
|
343
411
|
}
|