@livekit/agents-plugin-silero 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/vad.cjs CHANGED
@@ -38,11 +38,27 @@ const defaultVADOptions = {
38
38
  class VAD extends import_agents.VAD {
39
39
  #session;
40
40
  #opts;
41
+ #streams;
41
42
  label = "silero.VAD";
42
43
  constructor(session, opts) {
43
44
  super({ updateInterval: 32 });
44
45
  this.#session = session;
45
46
  this.#opts = opts;
47
+ this.#streams = [];
48
+ }
49
+ /**
50
+ * Updates the VAD options with new values.
51
+ *
52
+ * @param opts - Partial options object containing the values to update
53
+ * @remarks
54
+ * This method will merge the provided options with existing options and update all active streams.
55
+ * Only the properties specified in opts will be updated, other properties retain their current values.
56
+ */
57
+ updateOptions(opts) {
58
+ this.#opts = { ...this.#opts, ...opts };
59
+ for (const stream of this.#streams) {
60
+ stream.updateOptions(this.#opts);
61
+ }
46
62
  }
47
63
  /**
48
64
  * Load and initialize the Silero VAD model.
@@ -76,12 +92,22 @@ class VAD extends import_agents.VAD {
76
92
  return new VAD(session, mergedOpts);
77
93
  }
78
94
  stream() {
79
- return new VADStream(this, this.#opts, new import_onnx_model.OnnxModel(this.#session, this.#opts.sampleRate));
95
+ const stream = new VADStream(
96
+ this,
97
+ this.#opts,
98
+ new import_onnx_model.OnnxModel(this.#session, this.#opts.sampleRate)
99
+ );
100
+ this.#streams.push(stream);
101
+ return stream;
80
102
  }
81
103
  }
82
104
  class VADStream extends import_agents.VADStream {
83
105
  #opts;
84
106
  #model;
107
+ #inputSampleRate;
108
+ #speechBuffer;
109
+ #speechBufferMaxReached;
110
+ #prefixPaddingSamples;
85
111
  #task;
86
112
  #expFilter = new import_agents.ExpFilter(0.35);
87
113
  #extraInferenceTime = 0;
@@ -90,18 +116,18 @@ class VADStream extends import_agents.VADStream {
90
116
  super(vad);
91
117
  this.#opts = opts;
92
118
  this.#model = model;
119
+ this.#inputSampleRate = 0;
120
+ this.#speechBuffer = null;
121
+ this.#speechBufferMaxReached = false;
122
+ this.#prefixPaddingSamples = 0;
93
123
  this.#task = new Promise(async () => {
94
124
  let inferenceData = new Float32Array(this.#model.windowSizeSamples);
95
- let speechBuffer = null;
96
- let speechBufferMaxReached = false;
97
125
  let speechBufferIndex = 0;
98
126
  let pubSpeaking = false;
99
127
  let pubSpeechDuration = 0;
100
128
  let pubSilenceDuration = 0;
101
129
  let pubCurrentSample = 0;
102
130
  let pubTimestamp = 0;
103
- let pubSampleRate = 0;
104
- let pubPrefixPaddingSamples = 0;
105
131
  let speechThresholdDuration = 0;
106
132
  let silenceThresholdDuration = 0;
107
133
  let inputFrames = [];
@@ -112,24 +138,23 @@ class VADStream extends import_agents.VADStream {
112
138
  if (typeof frame === "symbol") {
113
139
  continue;
114
140
  }
115
- if (!pubSampleRate || !speechBuffer) {
116
- pubSampleRate = frame.sampleRate;
117
- pubPrefixPaddingSamples = Math.trunc(
118
- this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
119
- );
120
- speechBuffer = new Int16Array(
121
- this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples
141
+ if (!this.#inputSampleRate || !this.#speechBuffer) {
142
+ this.#inputSampleRate = frame.sampleRate;
143
+ this.#prefixPaddingSamples = Math.trunc(
144
+ this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
122
145
  );
123
- if (this.#opts.sampleRate !== pubSampleRate) {
146
+ const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
147
+ this.#speechBuffer = new Int16Array(bufferSize);
148
+ if (this.#opts.sampleRate !== this.#inputSampleRate) {
124
149
  resampler = new import_rtc_node.AudioResampler(
125
- pubSampleRate,
150
+ this.#inputSampleRate,
126
151
  this.#opts.sampleRate,
127
152
  1,
128
153
  import_rtc_node.AudioResamplerQuality.QUICK
129
154
  // VAD doesn't need high quality
130
155
  );
131
156
  }
132
- } else if (frame.sampleRate !== pubSampleRate) {
157
+ } else if (frame.sampleRate !== this.#inputSampleRate) {
133
158
  this.#logger.error("a frame with a different sample rate was already published");
134
159
  continue;
135
160
  }
@@ -155,17 +180,17 @@ class VADStream extends import_agents.VADStream {
155
180
  const windowDuration = this.#model.windowSizeSamples / this.#opts.sampleRate * 1e3;
156
181
  pubCurrentSample += this.#model.windowSizeSamples;
157
182
  pubTimestamp += windowDuration;
158
- const resamplingRatio = pubSampleRate / this.#model.sampleRate;
183
+ const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
159
184
  const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
160
185
  const toCopyInt = Math.trunc(toCopy);
161
186
  inputCopyRemainingFrac = toCopy - toCopyInt;
162
- const availableSpace = speechBuffer.length - speechBufferIndex;
187
+ const availableSpace = this.#speechBuffer.length - speechBufferIndex;
163
188
  const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
164
189
  if (toCopyBuffer > 0) {
165
- speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
190
+ this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
166
191
  speechBufferIndex += toCopyBuffer;
167
- } else if (!speechBufferMaxReached) {
168
- speechBufferMaxReached = true;
192
+ } else if (!this.#speechBufferMaxReached) {
193
+ this.#speechBufferMaxReached = true;
169
194
  this.#logger.warn(
170
195
  "maxBufferedSpeech reached, ignoring further data for the current speech input"
171
196
  );
@@ -192,30 +217,35 @@ class VADStream extends import_agents.VADStream {
192
217
  probability: p,
193
218
  inferenceDuration,
194
219
  frames: [
195
- new import_rtc_node.AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
220
+ new import_rtc_node.AudioFrame(
221
+ inputFrame.data.subarray(0, toCopyInt),
222
+ this.#inputSampleRate,
223
+ 1,
224
+ toCopyInt
225
+ )
196
226
  ],
197
227
  speaking: pubSpeaking,
198
228
  rawAccumulatedSilence: silenceThresholdDuration,
199
229
  rawAccumulatedSpeech: speechThresholdDuration
200
230
  });
201
231
  const resetWriteCursor = () => {
202
- if (!speechBuffer) throw new Error("speechBuffer is empty");
203
- if (speechBufferIndex <= pubPrefixPaddingSamples) {
232
+ if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
233
+ if (speechBufferIndex <= this.#prefixPaddingSamples) {
204
234
  return;
205
235
  }
206
- const paddingData = speechBuffer.subarray(
207
- speechBufferIndex - pubPrefixPaddingSamples,
236
+ const paddingData = this.#speechBuffer.subarray(
237
+ speechBufferIndex - this.#prefixPaddingSamples,
208
238
  speechBufferIndex
209
239
  );
210
- speechBuffer.set(paddingData, 0);
211
- speechBufferIndex = pubPrefixPaddingSamples;
212
- speechBufferMaxReached = false;
240
+ this.#speechBuffer.set(paddingData, 0);
241
+ speechBufferIndex = this.#prefixPaddingSamples;
242
+ this.#speechBufferMaxReached = false;
213
243
  };
214
244
  const copySpeechBuffer = () => {
215
- if (!speechBuffer) throw new Error("speechBuffer is empty");
245
+ if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
216
246
  return new import_rtc_node.AudioFrame(
217
- speechBuffer.subarray(0, speechBufferIndex),
218
- pubSampleRate,
247
+ this.#speechBuffer.subarray(0, speechBufferIndex),
248
+ this.#inputSampleRate,
219
249
  1,
220
250
  speechBufferIndex
221
251
  );
@@ -271,7 +301,9 @@ class VADStream extends import_agents.VADStream {
271
301
  inferenceFrames = [];
272
302
  if (inputFrame.data.length > toCopyInt) {
273
303
  const data = inputFrame.data.subarray(toCopyInt);
274
- inputFrames.push(new import_rtc_node.AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
304
+ inputFrames.push(
305
+ new import_rtc_node.AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2))
306
+ );
275
307
  }
276
308
  if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
277
309
  const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
@@ -283,6 +315,32 @@ class VADStream extends import_agents.VADStream {
283
315
  }
284
316
  });
285
317
  }
318
+ /**
319
+ * Update the VAD options
320
+ *
321
+ * @param opts - Partial options object containing the values to update
322
+ * @remarks
323
+ * This method allows you to update the VAD options after the VAD object has been created
324
+ */
325
+ updateOptions(opts) {
326
+ const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
327
+ this.#opts = { ...this.#opts, ...opts };
328
+ if (this.#inputSampleRate) {
329
+ if (this.#speechBuffer === null) throw new Error("speechBuffer is null");
330
+ this.#prefixPaddingSamples = Math.trunc(
331
+ this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
332
+ );
333
+ const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
334
+ const resizedBuffer = new Int16Array(bufferSize);
335
+ resizedBuffer.set(
336
+ this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize))
337
+ );
338
+ this.#speechBuffer = resizedBuffer;
339
+ if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
340
+ this.#speechBufferMaxReached = false;
341
+ }
342
+ }
343
+ }
286
344
  }
287
345
  // Annotate the CommonJS export names for ESM import in node:
288
346
  0 && (module.exports = {
package/dist/vad.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBuffer: Int16Array | null = null;\n let speechBufferMaxReached = false;\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let pubSampleRate = 0;\n let pubPrefixPaddingSamples = 0; // size in samples of padding data\n\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!pubSampleRate || !speechBuffer) {\n pubSampleRate = frame.sampleRate;\n pubPrefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,\n );\n\n speechBuffer = new Int16Array(\n this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,\n );\n\n if (this.#opts.sampleRate !== pubSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n pubSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== pubSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = pubSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!speechBufferMaxReached) {\n speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= pubPrefixPaddingSamples) {\n return;\n }\n\n const paddingData = speechBuffer.subarray(\n speechBufferIndex - pubPrefixPaddingSamples,\n speechBufferIndex,\n );\n speechBuffer.set(paddingData, 0);\n speechBufferIndex = pubPrefixPaddingSamples;\n speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n speechBuffer.subarray(0, speechBufferIndex),\n pubSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,WAAO,IAAI,UAAU,MAAM,KAAK,OAAO,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU,CAAC;AAAA,EAC5F;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAEd,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,eAAkC;AACtC,UAAI,yBAAyB;AAC7B,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,gBAAgB;AACpB,UAAI,0BAA0B;AAE9B,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,iBAAiB,CAAC,cAAc;AACnC,0BAAgB,MAAM;AACtB,oCAA0B,KAAK;AAAA,YAC5B,KAAK,MAAM,wBAAwB,gBAAiB;AAAA,UACvD;AAEA,yBAAe,IAAI;AAAA,YACjB,KAAK,MAAM,oBAAoB,gBAAgB;AAAA,UACjD;AAEA,cAAI,KAAK,MAAM,eAAe,eAAe;AAG3C,wBAAY,IAAI;AAAA,cACd;AAAA,cACA,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,eAAe;AAC7C,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,gBAAgB,KAAK,OAAO;AACpD,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,aAAa,SAAS;AAC7C,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,yBAAa,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AAC7E,iCAAqB;AAAA,UACvB,WAAW,CAAC,wBAAwB;AAClC,qCAAyB;AACzB,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI,2BAAW,WAAW,KAAK,SAAS,GAAG,SAAS,GAAG,eAAe,GAAG,SAAS;AAAA,YACpF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,gBAAI,qBAAqB,yBAAyB;AAChD;AAAA,YACF;AAEA,kBAAM,cAAc,aAAa;AAAA,cAC/B,oBAAoB;AAAA,cACpB;AAAA,YACF;AACA,yBAAa,IAAI,aAAa,CAAC;AAC/B,gCAAoB;AACpB,qCAAyB;AAAA,UAC3B;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,mBAAO,IAAI;AAAA,cACT,aAAa,SAAS,GAAG,iBAAiB;AAAA,cAC1C;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY,KAAK,IAAI,2BAAW,MAAM,eAAe,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC;AAAA,UACtF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AACF;","names":["baseVAD","baseStream"]}
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(0, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,GAAG,iBAAiB;AAAA,cAChD,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,2BAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":["baseVAD","baseStream"]}
package/dist/vad.d.ts CHANGED
@@ -23,6 +23,15 @@ export declare class VAD extends baseVAD {
23
23
  #private;
24
24
  label: string;
25
25
  constructor(session: InferenceSession, opts: VADOptions);
26
+ /**
27
+ * Updates the VAD options with new values.
28
+ *
29
+ * @param opts - Partial options object containing the values to update
30
+ * @remarks
31
+ * This method will merge the provided options with existing options and update all active streams.
32
+ * Only the properties specified in opts will be updated, other properties retain their current values.
33
+ */
34
+ updateOptions(opts: Partial<VADOptions>): void;
26
35
  /**
27
36
  * Load and initialize the Silero VAD model.
28
37
  *
@@ -55,5 +64,13 @@ export declare class VAD extends baseVAD {
55
64
  export declare class VADStream extends baseStream {
56
65
  #private;
57
66
  constructor(vad: VAD, opts: VADOptions, model: OnnxModel);
67
+ /**
68
+ * Update the VAD options
69
+ *
70
+ * @param opts - Partial options object containing the values to update
71
+ * @remarks
72
+ * This method allows you to update the VAD options after the VAD object has been created
73
+ */
74
+ updateOptions(opts: Partial<VADOptions>): void;
58
75
  }
59
76
  //# sourceMappingURL=vad.d.ts.map
package/dist/vad.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;IAG9B,KAAK,SAAgB;gBAET,OAAO,EAAE,gBAAgB,EAAE,IAAI,EAAE,UAAU;IAMvD;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;WACU,IAAI,CAAC,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM,GAAG,OAAO,CAAC,GAAG,CAAC;IAM/D,MAAM,IAAI,SAAS;CAGpB;AAED,qBAAa,SAAU,SAAQ,UAAU;;gBAQ3B,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS;CAiPzD"}
1
+ {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;IAI9B,KAAK,SAAgB;gBAET,OAAO,EAAE,gBAAgB,EAAE,IAAI,EAAE,UAAU;IAOvD;;;;;;;OAOG;IACH,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI;IAO9C;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;WACU,IAAI,CAAC,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM,GAAG,OAAO,CAAC,GAAG,CAAC;IAM/D,MAAM,IAAI,SAAS;CASpB;AAED,qBAAa,SAAU,SAAQ,UAAU;;gBAY3B,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS;IAwPxD;;;;;;OAMG;IACH,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,UAAU,CAAC;CA2BxC"}
package/dist/vad.js CHANGED
@@ -21,11 +21,27 @@ const defaultVADOptions = {
21
21
  class VAD extends baseVAD {
22
22
  #session;
23
23
  #opts;
24
+ #streams;
24
25
  label = "silero.VAD";
25
26
  constructor(session, opts) {
26
27
  super({ updateInterval: 32 });
27
28
  this.#session = session;
28
29
  this.#opts = opts;
30
+ this.#streams = [];
31
+ }
32
+ /**
33
+ * Updates the VAD options with new values.
34
+ *
35
+ * @param opts - Partial options object containing the values to update
36
+ * @remarks
37
+ * This method will merge the provided options with existing options and update all active streams.
38
+ * Only the properties specified in opts will be updated, other properties retain their current values.
39
+ */
40
+ updateOptions(opts) {
41
+ this.#opts = { ...this.#opts, ...opts };
42
+ for (const stream of this.#streams) {
43
+ stream.updateOptions(this.#opts);
44
+ }
29
45
  }
30
46
  /**
31
47
  * Load and initialize the Silero VAD model.
@@ -59,12 +75,22 @@ class VAD extends baseVAD {
59
75
  return new VAD(session, mergedOpts);
60
76
  }
61
77
  stream() {
62
- return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
78
+ const stream = new VADStream(
79
+ this,
80
+ this.#opts,
81
+ new OnnxModel(this.#session, this.#opts.sampleRate)
82
+ );
83
+ this.#streams.push(stream);
84
+ return stream;
63
85
  }
64
86
  }
65
87
  class VADStream extends baseStream {
66
88
  #opts;
67
89
  #model;
90
+ #inputSampleRate;
91
+ #speechBuffer;
92
+ #speechBufferMaxReached;
93
+ #prefixPaddingSamples;
68
94
  #task;
69
95
  #expFilter = new ExpFilter(0.35);
70
96
  #extraInferenceTime = 0;
@@ -73,18 +99,18 @@ class VADStream extends baseStream {
73
99
  super(vad);
74
100
  this.#opts = opts;
75
101
  this.#model = model;
102
+ this.#inputSampleRate = 0;
103
+ this.#speechBuffer = null;
104
+ this.#speechBufferMaxReached = false;
105
+ this.#prefixPaddingSamples = 0;
76
106
  this.#task = new Promise(async () => {
77
107
  let inferenceData = new Float32Array(this.#model.windowSizeSamples);
78
- let speechBuffer = null;
79
- let speechBufferMaxReached = false;
80
108
  let speechBufferIndex = 0;
81
109
  let pubSpeaking = false;
82
110
  let pubSpeechDuration = 0;
83
111
  let pubSilenceDuration = 0;
84
112
  let pubCurrentSample = 0;
85
113
  let pubTimestamp = 0;
86
- let pubSampleRate = 0;
87
- let pubPrefixPaddingSamples = 0;
88
114
  let speechThresholdDuration = 0;
89
115
  let silenceThresholdDuration = 0;
90
116
  let inputFrames = [];
@@ -95,24 +121,23 @@ class VADStream extends baseStream {
95
121
  if (typeof frame === "symbol") {
96
122
  continue;
97
123
  }
98
- if (!pubSampleRate || !speechBuffer) {
99
- pubSampleRate = frame.sampleRate;
100
- pubPrefixPaddingSamples = Math.trunc(
101
- this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
102
- );
103
- speechBuffer = new Int16Array(
104
- this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples
124
+ if (!this.#inputSampleRate || !this.#speechBuffer) {
125
+ this.#inputSampleRate = frame.sampleRate;
126
+ this.#prefixPaddingSamples = Math.trunc(
127
+ this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
105
128
  );
106
- if (this.#opts.sampleRate !== pubSampleRate) {
129
+ const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
130
+ this.#speechBuffer = new Int16Array(bufferSize);
131
+ if (this.#opts.sampleRate !== this.#inputSampleRate) {
107
132
  resampler = new AudioResampler(
108
- pubSampleRate,
133
+ this.#inputSampleRate,
109
134
  this.#opts.sampleRate,
110
135
  1,
111
136
  AudioResamplerQuality.QUICK
112
137
  // VAD doesn't need high quality
113
138
  );
114
139
  }
115
- } else if (frame.sampleRate !== pubSampleRate) {
140
+ } else if (frame.sampleRate !== this.#inputSampleRate) {
116
141
  this.#logger.error("a frame with a different sample rate was already published");
117
142
  continue;
118
143
  }
@@ -138,17 +163,17 @@ class VADStream extends baseStream {
138
163
  const windowDuration = this.#model.windowSizeSamples / this.#opts.sampleRate * 1e3;
139
164
  pubCurrentSample += this.#model.windowSizeSamples;
140
165
  pubTimestamp += windowDuration;
141
- const resamplingRatio = pubSampleRate / this.#model.sampleRate;
166
+ const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
142
167
  const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
143
168
  const toCopyInt = Math.trunc(toCopy);
144
169
  inputCopyRemainingFrac = toCopy - toCopyInt;
145
- const availableSpace = speechBuffer.length - speechBufferIndex;
170
+ const availableSpace = this.#speechBuffer.length - speechBufferIndex;
146
171
  const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
147
172
  if (toCopyBuffer > 0) {
148
- speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
173
+ this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
149
174
  speechBufferIndex += toCopyBuffer;
150
- } else if (!speechBufferMaxReached) {
151
- speechBufferMaxReached = true;
175
+ } else if (!this.#speechBufferMaxReached) {
176
+ this.#speechBufferMaxReached = true;
152
177
  this.#logger.warn(
153
178
  "maxBufferedSpeech reached, ignoring further data for the current speech input"
154
179
  );
@@ -175,30 +200,35 @@ class VADStream extends baseStream {
175
200
  probability: p,
176
201
  inferenceDuration,
177
202
  frames: [
178
- new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
203
+ new AudioFrame(
204
+ inputFrame.data.subarray(0, toCopyInt),
205
+ this.#inputSampleRate,
206
+ 1,
207
+ toCopyInt
208
+ )
179
209
  ],
180
210
  speaking: pubSpeaking,
181
211
  rawAccumulatedSilence: silenceThresholdDuration,
182
212
  rawAccumulatedSpeech: speechThresholdDuration
183
213
  });
184
214
  const resetWriteCursor = () => {
185
- if (!speechBuffer) throw new Error("speechBuffer is empty");
186
- if (speechBufferIndex <= pubPrefixPaddingSamples) {
215
+ if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
216
+ if (speechBufferIndex <= this.#prefixPaddingSamples) {
187
217
  return;
188
218
  }
189
- const paddingData = speechBuffer.subarray(
190
- speechBufferIndex - pubPrefixPaddingSamples,
219
+ const paddingData = this.#speechBuffer.subarray(
220
+ speechBufferIndex - this.#prefixPaddingSamples,
191
221
  speechBufferIndex
192
222
  );
193
- speechBuffer.set(paddingData, 0);
194
- speechBufferIndex = pubPrefixPaddingSamples;
195
- speechBufferMaxReached = false;
223
+ this.#speechBuffer.set(paddingData, 0);
224
+ speechBufferIndex = this.#prefixPaddingSamples;
225
+ this.#speechBufferMaxReached = false;
196
226
  };
197
227
  const copySpeechBuffer = () => {
198
- if (!speechBuffer) throw new Error("speechBuffer is empty");
228
+ if (!this.#speechBuffer) throw new Error("speechBuffer is empty");
199
229
  return new AudioFrame(
200
- speechBuffer.subarray(0, speechBufferIndex),
201
- pubSampleRate,
230
+ this.#speechBuffer.subarray(0, speechBufferIndex),
231
+ this.#inputSampleRate,
202
232
  1,
203
233
  speechBufferIndex
204
234
  );
@@ -254,7 +284,9 @@ class VADStream extends baseStream {
254
284
  inferenceFrames = [];
255
285
  if (inputFrame.data.length > toCopyInt) {
256
286
  const data = inputFrame.data.subarray(toCopyInt);
257
- inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
287
+ inputFrames.push(
288
+ new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2))
289
+ );
258
290
  }
259
291
  if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
260
292
  const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
@@ -266,6 +298,32 @@ class VADStream extends baseStream {
266
298
  }
267
299
  });
268
300
  }
301
+ /**
302
+ * Update the VAD options
303
+ *
304
+ * @param opts - Partial options object containing the values to update
305
+ * @remarks
306
+ * This method allows you to update the VAD options after the VAD object has been created
307
+ */
308
+ updateOptions(opts) {
309
+ const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
310
+ this.#opts = { ...this.#opts, ...opts };
311
+ if (this.#inputSampleRate) {
312
+ if (this.#speechBuffer === null) throw new Error("speechBuffer is null");
313
+ this.#prefixPaddingSamples = Math.trunc(
314
+ this.#opts.prefixPaddingDuration * this.#inputSampleRate / 1e3
315
+ );
316
+ const bufferSize = Math.trunc(this.#opts.maxBufferedSpeech * this.#inputSampleRate / 1e3) + this.#prefixPaddingSamples;
317
+ const resizedBuffer = new Int16Array(bufferSize);
318
+ resizedBuffer.set(
319
+ this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize))
320
+ );
321
+ this.#speechBuffer = resizedBuffer;
322
+ if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
323
+ this.#speechBufferMaxReached = false;
324
+ }
325
+ }
326
+ }
269
327
  }
270
328
  export {
271
329
  VAD,
package/dist/vad.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBuffer: Int16Array | null = null;\n let speechBufferMaxReached = false;\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let pubSampleRate = 0;\n let pubPrefixPaddingSamples = 0; // size in samples of padding data\n\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!pubSampleRate || !speechBuffer) {\n pubSampleRate = frame.sampleRate;\n pubPrefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,\n );\n\n speechBuffer = new Int16Array(\n this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,\n );\n\n if (this.#opts.sampleRate !== pubSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n pubSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== pubSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = pubSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!speechBufferMaxReached) {\n speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= pubPrefixPaddingSamples) {\n return;\n }\n\n const paddingData = speechBuffer.subarray(\n speechBufferIndex - pubPrefixPaddingSamples,\n speechBufferIndex,\n );\n speechBuffer.set(paddingData, 0);\n speechBufferIndex = pubPrefixPaddingSamples;\n speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n speechBuffer.subarray(0, speechBufferIndex),\n pubSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,WAAO,IAAI,UAAU,MAAM,KAAK,OAAO,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU,CAAC;AAAA,EAC5F;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAEd,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,eAAkC;AACtC,UAAI,yBAAyB;AAC7B,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,gBAAgB;AACpB,UAAI,0BAA0B;AAE9B,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,iBAAiB,CAAC,cAAc;AACnC,0BAAgB,MAAM;AACtB,oCAA0B,KAAK;AAAA,YAC5B,KAAK,MAAM,wBAAwB,gBAAiB;AAAA,UACvD;AAEA,yBAAe,IAAI;AAAA,YACjB,KAAK,MAAM,oBAAoB,gBAAgB;AAAA,UACjD;AAEA,cAAI,KAAK,MAAM,eAAe,eAAe;AAG3C,wBAAY,IAAI;AAAA,cACd;AAAA,cACA,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,eAAe;AAC7C,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,gBAAgB,KAAK,OAAO;AACpD,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,aAAa,SAAS;AAC7C,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,yBAAa,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AAC7E,iCAAqB;AAAA,UACvB,WAAW,CAAC,wBAAwB;AAClC,qCAAyB;AACzB,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI,WAAW,WAAW,KAAK,SAAS,GAAG,SAAS,GAAG,eAAe,GAAG,SAAS;AAAA,YACpF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,gBAAI,qBAAqB,yBAAyB;AAChD;AAAA,YACF;AAEA,kBAAM,cAAc,aAAa;AAAA,cAC/B,oBAAoB;AAAA,cACpB;AAAA,YACF;AACA,yBAAa,IAAI,aAAa,CAAC;AAC/B,gCAAoB;AACpB,qCAAyB;AAAA,UAC3B;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,mBAAO,IAAI;AAAA,cACT,aAAa,SAAS,GAAG,iBAAiB;AAAA,cAC1C;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY,KAAK,IAAI,WAAW,MAAM,eAAe,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC;AAAA,UACtF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AACF;","names":[]}
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(0, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,GAAG,iBAAiB;AAAA,cAChD,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,WAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-silero",
3
- "version": "0.5.3",
3
+ "version": "0.5.4",
4
4
  "description": "Silero voice activity detection LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
@@ -23,7 +23,7 @@
23
23
  ],
24
24
  "devDependencies": {
25
25
  "@livekit/agents": "^x",
26
- "@livekit/rtc-node": "^0.13.2",
26
+ "@livekit/rtc-node": "^0.13.4",
27
27
  "@microsoft/api-extractor": "^7.35.0",
28
28
  "@types/ws": "^8.5.10",
29
29
  "onnxruntime-common": "^1.19.2",
@@ -35,8 +35,8 @@
35
35
  "ws": "^8.16.0"
36
36
  },
37
37
  "peerDependencies": {
38
- "@livekit/rtc-node": "^0.13.2",
39
- "@livekit/agents": "^0.6.2x"
38
+ "@livekit/rtc-node": "^0.13.4",
39
+ "@livekit/agents": "^0.7.0x"
40
40
  },
41
41
  "scripts": {
42
42
  "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/silero_vad.onnx dist/",
package/src/vad.ts CHANGED
@@ -46,12 +46,29 @@ const defaultVADOptions: VADOptions = {
46
46
  export class VAD extends baseVAD {
47
47
  #session: InferenceSession;
48
48
  #opts: VADOptions;
49
+ #streams: VADStream[];
49
50
  label = 'silero.VAD';
50
51
 
51
52
  constructor(session: InferenceSession, opts: VADOptions) {
52
53
  super({ updateInterval: 32 });
53
54
  this.#session = session;
54
55
  this.#opts = opts;
56
+ this.#streams = [];
57
+ }
58
+
59
+ /**
60
+ * Updates the VAD options with new values.
61
+ *
62
+ * @param opts - Partial options object containing the values to update
63
+ * @remarks
64
+ * This method will merge the provided options with existing options and update all active streams.
65
+ * Only the properties specified in opts will be updated, other properties retain their current values.
66
+ */
67
+ updateOptions(opts: Partial<VADOptions>): void {
68
+ this.#opts = { ...this.#opts, ...opts };
69
+ for (const stream of this.#streams) {
70
+ stream.updateOptions(this.#opts);
71
+ }
55
72
  }
56
73
 
57
74
  /**
@@ -87,13 +104,23 @@ export class VAD extends baseVAD {
87
104
  }
88
105
 
89
106
  stream(): VADStream {
90
- return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
107
+ const stream = new VADStream(
108
+ this,
109
+ this.#opts,
110
+ new OnnxModel(this.#session, this.#opts.sampleRate),
111
+ );
112
+ this.#streams.push(stream);
113
+ return stream;
91
114
  }
92
115
  }
93
116
 
94
117
  export class VADStream extends baseStream {
95
118
  #opts: VADOptions;
96
119
  #model: OnnxModel;
120
+ #inputSampleRate: number;
121
+ #speechBuffer: Int16Array | null;
122
+ #speechBufferMaxReached: boolean;
123
+ #prefixPaddingSamples: number;
97
124
  #task: Promise<void>;
98
125
  #expFilter = new ExpFilter(0.35);
99
126
  #extraInferenceTime = 0;
@@ -103,13 +130,15 @@ export class VADStream extends baseStream {
103
130
  super(vad);
104
131
  this.#opts = opts;
105
132
  this.#model = model;
133
+ this.#inputSampleRate = 0;
134
+ this.#speechBuffer = null;
135
+ this.#speechBufferMaxReached = false;
136
+ this.#prefixPaddingSamples = 0;
106
137
 
107
138
  this.#task = new Promise(async () => {
108
139
  let inferenceData = new Float32Array(this.#model.windowSizeSamples);
109
140
 
110
141
  // a copy is exposed to the user in END_OF_SPEECH
111
- let speechBuffer: Int16Array | null = null;
112
- let speechBufferMaxReached = false;
113
142
  let speechBufferIndex = 0;
114
143
 
115
144
  // "pub" means public, these values are exposed to the users through events
@@ -118,9 +147,6 @@ export class VADStream extends baseStream {
118
147
  let pubSilenceDuration = 0;
119
148
  let pubCurrentSample = 0;
120
149
  let pubTimestamp = 0;
121
- let pubSampleRate = 0;
122
- let pubPrefixPaddingSamples = 0; // size in samples of padding data
123
-
124
150
  let speechThresholdDuration = 0;
125
151
  let silenceThresholdDuration = 0;
126
152
 
@@ -136,27 +162,27 @@ export class VADStream extends baseStream {
136
162
  continue; // ignore flush sentinel for now
137
163
  }
138
164
 
139
- if (!pubSampleRate || !speechBuffer) {
140
- pubSampleRate = frame.sampleRate;
141
- pubPrefixPaddingSamples = Math.trunc(
142
- (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,
143
- );
144
-
145
- speechBuffer = new Int16Array(
146
- this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,
165
+ if (!this.#inputSampleRate || !this.#speechBuffer) {
166
+ this.#inputSampleRate = frame.sampleRate;
167
+ this.#prefixPaddingSamples = Math.trunc(
168
+ (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,
147
169
  );
170
+ const bufferSize =
171
+ Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +
172
+ this.#prefixPaddingSamples;
173
+ this.#speechBuffer = new Int16Array(bufferSize);
148
174
 
149
- if (this.#opts.sampleRate !== pubSampleRate) {
175
+ if (this.#opts.sampleRate !== this.#inputSampleRate) {
150
176
  // resampling needed: the input sample rate isn't the same as the model's
151
177
  // sample rate used for inference
152
178
  resampler = new AudioResampler(
153
- pubSampleRate,
179
+ this.#inputSampleRate,
154
180
  this.#opts.sampleRate,
155
181
  1,
156
182
  AudioResamplerQuality.QUICK, // VAD doesn't need high quality
157
183
  );
158
184
  }
159
- } else if (frame.sampleRate !== pubSampleRate) {
185
+ } else if (frame.sampleRate !== this.#inputSampleRate) {
160
186
  this.#logger.error('a frame with a different sample rate was already published');
161
187
  continue;
162
188
  }
@@ -194,19 +220,19 @@ export class VADStream extends baseStream {
194
220
  const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;
195
221
  pubCurrentSample += this.#model.windowSizeSamples;
196
222
  pubTimestamp += windowDuration;
197
- const resamplingRatio = pubSampleRate / this.#model.sampleRate;
223
+ const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;
198
224
  const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
199
225
  const toCopyInt = Math.trunc(toCopy);
200
226
  inputCopyRemainingFrac = toCopy - toCopyInt;
201
227
 
202
228
  // copy the inference window to the speech buffer
203
- const availableSpace = speechBuffer.length - speechBufferIndex;
229
+ const availableSpace = this.#speechBuffer.length - speechBufferIndex;
204
230
  const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
205
231
  if (toCopyBuffer > 0) {
206
- speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
232
+ this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
207
233
  speechBufferIndex += toCopyBuffer;
208
- } else if (!speechBufferMaxReached) {
209
- speechBufferMaxReached = true;
234
+ } else if (!this.#speechBufferMaxReached) {
235
+ this.#speechBufferMaxReached = true;
210
236
  this.#logger.warn(
211
237
  'maxBufferedSpeech reached, ignoring further data for the current speech input',
212
238
  );
@@ -238,7 +264,12 @@ export class VADStream extends baseStream {
238
264
  probability: p,
239
265
  inferenceDuration,
240
266
  frames: [
241
- new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),
267
+ new AudioFrame(
268
+ inputFrame.data.subarray(0, toCopyInt),
269
+ this.#inputSampleRate,
270
+ 1,
271
+ toCopyInt,
272
+ ),
242
273
  ],
243
274
  speaking: pubSpeaking,
244
275
  rawAccumulatedSilence: silenceThresholdDuration,
@@ -246,25 +277,25 @@ export class VADStream extends baseStream {
246
277
  });
247
278
 
248
279
  const resetWriteCursor = () => {
249
- if (!speechBuffer) throw new Error('speechBuffer is empty');
250
- if (speechBufferIndex <= pubPrefixPaddingSamples) {
280
+ if (!this.#speechBuffer) throw new Error('speechBuffer is empty');
281
+ if (speechBufferIndex <= this.#prefixPaddingSamples) {
251
282
  return;
252
283
  }
253
284
 
254
- const paddingData = speechBuffer.subarray(
255
- speechBufferIndex - pubPrefixPaddingSamples,
285
+ const paddingData = this.#speechBuffer.subarray(
286
+ speechBufferIndex - this.#prefixPaddingSamples,
256
287
  speechBufferIndex,
257
288
  );
258
- speechBuffer.set(paddingData, 0);
259
- speechBufferIndex = pubPrefixPaddingSamples;
260
- speechBufferMaxReached = false;
289
+ this.#speechBuffer.set(paddingData, 0);
290
+ speechBufferIndex = this.#prefixPaddingSamples;
291
+ this.#speechBufferMaxReached = false;
261
292
  };
262
293
 
263
294
  const copySpeechBuffer = (): AudioFrame => {
264
- if (!speechBuffer) throw new Error('speechBuffer is empty');
295
+ if (!this.#speechBuffer) throw new Error('speechBuffer is empty');
265
296
  return new AudioFrame(
266
- speechBuffer.subarray(0, speechBufferIndex),
267
- pubSampleRate,
297
+ this.#speechBuffer.subarray(0, speechBufferIndex),
298
+ this.#inputSampleRate,
268
299
  1,
269
300
  speechBufferIndex,
270
301
  );
@@ -328,7 +359,9 @@ export class VADStream extends baseStream {
328
359
 
329
360
  if (inputFrame.data.length > toCopyInt) {
330
361
  const data = inputFrame.data.subarray(toCopyInt);
331
- inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
362
+ inputFrames.push(
363
+ new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),
364
+ );
332
365
  }
333
366
  if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
334
367
  const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
@@ -340,4 +373,39 @@ export class VADStream extends baseStream {
340
373
  }
341
374
  });
342
375
  }
376
+
377
+ /**
378
+ * Update the VAD options
379
+ *
380
+ * @param opts - Partial options object containing the values to update
381
+ * @remarks
382
+ * This method allows you to update the VAD options after the VAD object has been created
383
+ */
384
+ updateOptions(opts: Partial<VADOptions>) {
385
+ const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;
386
+ this.#opts = { ...this.#opts, ...opts };
387
+
388
+ if (this.#inputSampleRate) {
389
+ // Assert speech buffer exists
390
+ if (this.#speechBuffer === null) throw new Error('speechBuffer is null');
391
+
392
+ // Resize speech buffer
393
+ this.#prefixPaddingSamples = Math.trunc(
394
+ (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,
395
+ );
396
+ const bufferSize =
397
+ Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +
398
+ this.#prefixPaddingSamples;
399
+ const resizedBuffer = new Int16Array(bufferSize);
400
+ resizedBuffer.set(
401
+ this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),
402
+ );
403
+ this.#speechBuffer = resizedBuffer;
404
+
405
+ // Determine if max has been reached
406
+ if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
407
+ this.#speechBufferMaxReached = false;
408
+ }
409
+ }
410
+ }
343
411
  }