@livekit/agents-plugin-silero 0.5.7 → 0.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/vad.cjs +5 -4
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.js +5 -4
- package/dist/vad.js.map +1 -1
- package/package.json +4 -4
- package/src/vad.ts +5 -5
package/dist/vad.cjs
CHANGED
|
@@ -28,9 +28,10 @@ var import_onnx_model = require("./onnx_model.cjs");
|
|
|
28
28
|
const SLOW_INFERENCE_THRESHOLD = 200;
|
|
29
29
|
const defaultVADOptions = {
|
|
30
30
|
minSpeechDuration: 50,
|
|
31
|
-
minSilenceDuration: 250,
|
|
31
|
+
minSilenceDuration: 550,
|
|
32
32
|
prefixPaddingDuration: 500,
|
|
33
33
|
maxBufferedSpeech: 6e4,
|
|
34
|
+
// 60 seconds
|
|
34
35
|
activationThreshold: 0.5,
|
|
35
36
|
sampleRate: 16e3,
|
|
36
37
|
forceCPU: true
|
|
@@ -185,7 +186,7 @@ class VADStream extends import_agents.VADStream {
|
|
|
185
186
|
const toCopyInt = Math.trunc(toCopy);
|
|
186
187
|
inputCopyRemainingFrac = toCopy - toCopyInt;
|
|
187
188
|
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
188
|
-
const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
|
|
189
|
+
const toCopyBuffer = Math.min(toCopyInt, availableSpace);
|
|
189
190
|
if (toCopyBuffer > 0) {
|
|
190
191
|
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
191
192
|
speechBufferIndex += toCopyBuffer;
|
|
@@ -204,9 +205,9 @@ class VADStream extends import_agents.VADStream {
|
|
|
204
205
|
this.#logger.child({ delay: this.#extraInferenceTime }).warn("inference is slower than realtime");
|
|
205
206
|
}
|
|
206
207
|
if (pubSpeaking) {
|
|
207
|
-
pubSpeechDuration += inferenceDuration;
|
|
208
|
+
pubSpeechDuration += windowDuration;
|
|
208
209
|
} else {
|
|
209
|
-
pubSilenceDuration += inferenceDuration;
|
|
210
|
+
pubSilenceDuration += windowDuration;
|
|
210
211
|
}
|
|
211
212
|
this.queue.put({
|
|
212
213
|
type: import_agents.VADEventType.INFERENCE_DONE,
|
package/dist/vad.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the 
provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = 
model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: 
this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(this.#prefixPaddingSamples, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: 
pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UA
AI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,
KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,KAAK,uBAAuB,iBAAiB;AAAA,cACzE,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,2BAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,i
BAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":["baseVAD","baseStream"]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 550,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000, // 60 seconds\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method 
will merge the provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n 
this.#model = model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(toCopyInt, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime 
})\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += windowDuration;\n } else {\n pubSilenceDuration += windowDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(this.#prefixPaddingSamples, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n 
rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAC
lB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,WAAW,cAAc;AACvD,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mC
AAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,KAAK,uBAAuB,iBAAiB;AAAA,cACzE,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,2BAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;A
ACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":["baseVAD","baseStream"]}
|
package/dist/vad.js
CHANGED
|
@@ -11,9 +11,10 @@ import { OnnxModel, newInferenceSession } from "./onnx_model.js";
|
|
|
11
11
|
const SLOW_INFERENCE_THRESHOLD = 200;
|
|
12
12
|
const defaultVADOptions = {
|
|
13
13
|
minSpeechDuration: 50,
|
|
14
|
-
minSilenceDuration:
|
|
14
|
+
minSilenceDuration: 550,
|
|
15
15
|
prefixPaddingDuration: 500,
|
|
16
16
|
maxBufferedSpeech: 6e4,
|
|
17
|
+
// 60 seconds
|
|
17
18
|
activationThreshold: 0.5,
|
|
18
19
|
sampleRate: 16e3,
|
|
19
20
|
forceCPU: true
|
|
@@ -168,7 +169,7 @@ class VADStream extends baseStream {
|
|
|
168
169
|
const toCopyInt = Math.trunc(toCopy);
|
|
169
170
|
inputCopyRemainingFrac = toCopy - toCopyInt;
|
|
170
171
|
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
171
|
-
const toCopyBuffer = Math.min(
|
|
172
|
+
const toCopyBuffer = Math.min(toCopyInt, availableSpace);
|
|
172
173
|
if (toCopyBuffer > 0) {
|
|
173
174
|
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
174
175
|
speechBufferIndex += toCopyBuffer;
|
|
@@ -187,9 +188,9 @@ class VADStream extends baseStream {
|
|
|
187
188
|
this.#logger.child({ delay: this.#extraInferenceTime }).warn("inference is slower than realtime");
|
|
188
189
|
}
|
|
189
190
|
if (pubSpeaking) {
|
|
190
|
-
pubSpeechDuration +=
|
|
191
|
+
pubSpeechDuration += windowDuration;
|
|
191
192
|
} else {
|
|
192
|
-
pubSilenceDuration +=
|
|
193
|
+
pubSilenceDuration += windowDuration;
|
|
193
194
|
}
|
|
194
195
|
this.queue.put({
|
|
195
196
|
type: VADEventType.INFERENCE_DONE,
|
package/dist/vad.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method will merge the 
provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = 
model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: 
this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(this.#prefixPaddingSamples, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: 
pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGl
E,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAA
M,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,KAAK,uBAAuB,iBAAiB;AAAA,cACzE,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,WAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,
OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 550,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000, // 60 seconds\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n #streams: VADStream[];\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n this.#streams = [];\n }\n\n /**\n * Updates the VAD options with new values.\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method 
will merge the provided options with existing options and update all active streams.\n * Only the properties specified in opts will be updated, other properties retain their current values.\n */\n updateOptions(opts: Partial<VADOptions>): void {\n this.#opts = { ...this.#opts, ...opts };\n for (const stream of this.#streams) {\n stream.updateOptions(this.#opts);\n }\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n const stream = new VADStream(\n this,\n this.#opts,\n new OnnxModel(this.#session, this.#opts.sampleRate),\n );\n this.#streams.push(stream);\n return stream;\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #inputSampleRate: number;\n #speechBuffer: Int16Array | null;\n #speechBufferMaxReached: boolean;\n #prefixPaddingSamples: number;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n 
this.#model = model;\n this.#inputSampleRate = 0;\n this.#speechBuffer = null;\n this.#speechBufferMaxReached = false;\n this.#prefixPaddingSamples = 0;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!this.#inputSampleRate || !this.#speechBuffer) {\n this.#inputSampleRate = frame.sampleRate;\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n this.#speechBuffer = new Int16Array(bufferSize);\n\n if (this.#opts.sampleRate !== this.#inputSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n this.#inputSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== this.#inputSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n 
inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = this.#inputSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = this.#speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(toCopyInt, availableSpace);\n if (toCopyBuffer > 0) {\n this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!this.#speechBufferMaxReached) {\n this.#speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime 
})\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += windowDuration;\n } else {\n pubSilenceDuration += windowDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(\n inputFrame.data.subarray(0, toCopyInt),\n this.#inputSampleRate,\n 1,\n toCopyInt,\n ),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= this.#prefixPaddingSamples) {\n return;\n }\n\n const paddingData = this.#speechBuffer.subarray(\n speechBufferIndex - this.#prefixPaddingSamples,\n speechBufferIndex,\n );\n this.#speechBuffer.set(paddingData, 0);\n speechBufferIndex = this.#prefixPaddingSamples;\n this.#speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!this.#speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n this.#speechBuffer.subarray(this.#prefixPaddingSamples, speechBufferIndex),\n this.#inputSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n 
rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = inputFrame.data.subarray(toCopyInt);\n inputFrames.push(\n new AudioFrame(data, this.#inputSampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n\n /**\n * Update the VAD options\n *\n * @param opts - Partial options object containing the values to update\n * @remarks\n * This method allows you to update the VAD options after the VAD object has been created\n */\n updateOptions(opts: Partial<VADOptions>) {\n const oldMaxBufferedSpeech = this.#opts.maxBufferedSpeech;\n this.#opts = { ...this.#opts, ...opts };\n\n if (this.#inputSampleRate) {\n // Assert speech buffer exists\n if (this.#speechBuffer === null) throw new Error('speechBuffer is null');\n\n // Resize speech buffer\n this.#prefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * this.#inputSampleRate) / 1000,\n );\n const bufferSize =\n 
Math.trunc((this.#opts.maxBufferedSpeech * this.#inputSampleRate) / 1000) +\n this.#prefixPaddingSamples;\n const resizedBuffer = new Int16Array(bufferSize);\n resizedBuffer.set(\n this.#speechBuffer.subarray(0, Math.min(this.#speechBuffer.length, bufferSize)),\n );\n this.#speechBuffer = resizedBuffer;\n\n // Determine if max has been reached\n if (this.#opts.maxBufferedSpeech > oldMaxBufferedSpeech) {\n this.#speechBufferMaxReached = false;\n }\n }\n }\n}\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA,aAAa;AAAA,EACb,OAAO;AAAA,EACP;AAAA,EACA;AAAA,OACK;AACP,SAAS,YAAY,gBAAgB,6BAA6B;AAGlE,SAAS,WAAW,2BAA2B;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,QAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AACb,SAAK,WAAW,CAAC;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,cAAc,MAAiC;AAC7C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AACtC,eAAW,UAAU,KAAK,UAAU;AAClC,aAAO,cAAc,KAAK,KAAK;AAAA,IACjC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,MAAM,oBAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA,KAAK;AAAA,MACL,IAAI,UAAU,KAAK,UAAU,KAAK,MAAM,UAAU;AAAA,IACpD;AACA,SAAK,SAAS,KAAK,MAAM;AACzB,WAAO;AAAA,EACT;AACF;AAEO,MAAM,kBAAkB,WAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,UAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,UAAU,IAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AACd,SAAK,mBAAmB;AACxB,SAAK,gBAAgB;AACrB,SAAK,0BAA0B;AAC/B,SAAK,wBAAwB;AAE7B,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB
;AAGlE,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,KAAK,oBAAoB,CAAC,KAAK,eAAe;AACjD,eAAK,mBAAmB,MAAM;AAC9B,eAAK,wBAAwB,KAAK;AAAA,YAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,UAC/D;AACA,gBAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,eAAK,gBAAgB,IAAI,WAAW,UAAU;AAE9C,cAAI,KAAK,MAAM,eAAe,KAAK,kBAAkB;AAGnD,wBAAY,IAAI;AAAA,cACd,KAAK;AAAA,cACL,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sBAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,KAAK,kBAAkB;AACrD,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,aAAa,YAAY,WAAW;AAC1C,gBAAM,iBAAiB,YAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,KAAK,mBAAmB,KAAK,OAAO;AAC5D,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,KAAK,cAAc,SAAS;AACnD,gBAAM,eAAe,KAAK,IAAI,WAAW,cAAc;AACvD,cAAI,eAAe,GAAG;AACpB,iBAAK,cAAc,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AACnF,iCAAqB;AAAA,UACvB,WAAW,CAAC,KAAK,yBAAyB;AACxC,iBAAK,0BAA0B;AAC/B,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,
OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,aAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI;AAAA,gBACF,WAAW,KAAK,SAAS,GAAG,SAAS;AAAA,gBACrC,KAAK;AAAA,gBACL;AAAA,gBACA;AAAA,cACF;AAAA,YACF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,gBAAI,qBAAqB,KAAK,uBAAuB;AACnD;AAAA,YACF;AAEA,kBAAM,cAAc,KAAK,cAAc;AAAA,cACrC,oBAAoB,KAAK;AAAA,cACzB;AAAA,YACF;AACA,iBAAK,cAAc,IAAI,aAAa,CAAC;AACrC,gCAAoB,KAAK;AACzB,iBAAK,0BAA0B;AAAA,UACjC;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,KAAK,cAAe,OAAM,IAAI,MAAM,uBAAuB;AAChE,mBAAO,IAAI;AAAA,cACT,KAAK,cAAc,SAAS,KAAK,uBAAuB,iBAAiB;AAAA,cACzE,KAAK;AAAA,cACL;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,aAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY;AAAA,cACV,IAAI,WAAW,MAAM,KAAK,kBAAkB,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eA
Ae,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,WAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,cAAc,MAA2B;AACvC,UAAM,uBAAuB,KAAK,MAAM;AACxC,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAEtC,QAAI,KAAK,kBAAkB;AAEzB,UAAI,KAAK,kBAAkB,KAAM,OAAM,IAAI,MAAM,sBAAsB;AAGvE,WAAK,wBAAwB,KAAK;AAAA,QAC/B,KAAK,MAAM,wBAAwB,KAAK,mBAAoB;AAAA,MAC/D;AACA,YAAM,aACJ,KAAK,MAAO,KAAK,MAAM,oBAAoB,KAAK,mBAAoB,GAAI,IACxE,KAAK;AACP,YAAM,gBAAgB,IAAI,WAAW,UAAU;AAC/C,oBAAc;AAAA,QACZ,KAAK,cAAc,SAAS,GAAG,KAAK,IAAI,KAAK,cAAc,QAAQ,UAAU,CAAC;AAAA,MAChF;AACA,WAAK,gBAAgB;AAGrB,UAAI,KAAK,MAAM,oBAAoB,sBAAsB;AACvD,aAAK,0BAA0B;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-silero",
|
|
3
|
-
"version": "0.5.7",
|
|
3
|
+
"version": "0.5.9",
|
|
4
4
|
"description": "Silero voice activity detection LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -29,17 +29,17 @@
|
|
|
29
29
|
"@livekit/rtc-node": "^0.13.11",
|
|
30
30
|
"@microsoft/api-extractor": "^7.35.0",
|
|
31
31
|
"@types/ws": "^8.5.10",
|
|
32
|
-
"onnxruntime-common": "
|
|
32
|
+
"onnxruntime-common": ">=1.19.0 <1.22.0",
|
|
33
33
|
"tsup": "^8.3.5",
|
|
34
34
|
"typescript": "^5.0.0"
|
|
35
35
|
},
|
|
36
36
|
"dependencies": {
|
|
37
|
-
"onnxruntime-node": "
|
|
37
|
+
"onnxruntime-node": ">=1.19.0 <1.22.0",
|
|
38
38
|
"ws": "^8.16.0"
|
|
39
39
|
},
|
|
40
40
|
"peerDependencies": {
|
|
41
41
|
"@livekit/rtc-node": "^0.13.11",
|
|
42
|
-
"@livekit/agents": "^0.7.
|
|
42
|
+
"@livekit/agents": "^0.7.9x"
|
|
43
43
|
},
|
|
44
44
|
"scripts": {
|
|
45
45
|
"build": "tsup --onSuccess \"pnpm build:types\" && cp src/silero_vad.onnx dist/",
|
package/src/vad.ts
CHANGED
|
@@ -35,9 +35,9 @@ export interface VADOptions {
|
|
|
35
35
|
|
|
36
36
|
const defaultVADOptions: VADOptions = {
|
|
37
37
|
minSpeechDuration: 50,
|
|
38
|
-
minSilenceDuration:
|
|
38
|
+
minSilenceDuration: 550,
|
|
39
39
|
prefixPaddingDuration: 500,
|
|
40
|
-
maxBufferedSpeech: 60000,
|
|
40
|
+
maxBufferedSpeech: 60000, // 60 seconds
|
|
41
41
|
activationThreshold: 0.5,
|
|
42
42
|
sampleRate: 16000,
|
|
43
43
|
forceCPU: true,
|
|
@@ -227,7 +227,7 @@ export class VADStream extends baseStream {
|
|
|
227
227
|
|
|
228
228
|
// copy the inference window to the speech buffer
|
|
229
229
|
const availableSpace = this.#speechBuffer.length - speechBufferIndex;
|
|
230
|
-
const toCopyBuffer = Math.min(
|
|
230
|
+
const toCopyBuffer = Math.min(toCopyInt, availableSpace);
|
|
231
231
|
if (toCopyBuffer > 0) {
|
|
232
232
|
this.#speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
|
|
233
233
|
speechBufferIndex += toCopyBuffer;
|
|
@@ -250,9 +250,9 @@ export class VADStream extends baseStream {
|
|
|
250
250
|
}
|
|
251
251
|
|
|
252
252
|
if (pubSpeaking) {
|
|
253
|
-
pubSpeechDuration +=
|
|
253
|
+
pubSpeechDuration += windowDuration;
|
|
254
254
|
} else {
|
|
255
|
-
pubSilenceDuration +=
|
|
255
|
+
pubSilenceDuration += windowDuration;
|
|
256
256
|
}
|
|
257
257
|
|
|
258
258
|
this.queue.put({
|