@livekit/agents-plugin-silero 0.4.6 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n ExpFilter,\n VADEventType,\n VADStream as baseStream,\n VAD as baseVAD,\n log,\n mergeFrames,\n} from '@livekit/agents';\nimport { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';\nimport type { InferenceSession } from 'onnxruntime-node';\nimport type { SampleRate } from './onnx_model.js';\nimport { OnnxModel, newInferenceSession } from './onnx_model.js';\n\nconst SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms\n\nexport interface VADOptions {\n /** Minimum duration of speech to start a new speech chunk */\n minSpeechDuration: number;\n /** At the end of each speech, wait this duration before ending the speech */\n minSilenceDuration: number;\n /** Duration of padding to add to the beginning of each speech chunk */\n prefixPaddingDuration: number;\n /** Maximum duration of speech to keep in the buffer */\n maxBufferedSpeech: number;\n /** Maximum duration of speech to keep in the buffer*/\n activationThreshold: number;\n /** Sample rate for the inference (only 8KHz and 16KHz are supported) */\n sampleRate: SampleRate;\n /** Force the use of CPU for inference */\n forceCPU: boolean;\n}\n\nconst defaultVADOptions: VADOptions = {\n minSpeechDuration: 50,\n minSilenceDuration: 250,\n prefixPaddingDuration: 500,\n maxBufferedSpeech: 60000,\n activationThreshold: 0.5,\n sampleRate: 16000,\n forceCPU: true,\n};\n\nexport class VAD extends baseVAD {\n #session: InferenceSession;\n #opts: VADOptions;\n label = 'silero.VAD';\n\n constructor(session: InferenceSession, opts: VADOptions) {\n super({ updateInterval: 32 });\n this.#session = session;\n this.#opts = opts;\n }\n\n /**\n * Load and initialize the Silero VAD model.\n *\n * This method loads the ONNX model and prepares it for inference. 
When options are not provided,\n * sane defaults are used.\n *\n * @remarks\n * This method may take time to load the model into memory.\n * It is recommended to call this method inside your prewarm mechanism.\n *\n * @example\n * ```ts\n * export default defineAgent({\n * prewarm: async (proc: JobProcess) => {\n * proc.userData.vad = await VAD.load();\n * },\n * entry: async (ctx: JobContext) => {\n * const vad = ctx.proc.userData.vad! as VAD;\n * // the rest of your agent logic\n * },\n * });\n * ```\n *\n * @param options -\n * @returns Promise\\<{@link VAD}\\>: An instance of the VAD class ready for streaming.\n */\n static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {\n const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };\n const session = await newInferenceSession(mergedOpts.forceCPU);\n return new VAD(session, mergedOpts);\n }\n\n stream(): VADStream {\n return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));\n }\n}\n\nexport class VADStream extends baseStream {\n #opts: VADOptions;\n #model: OnnxModel;\n #task: Promise<void>;\n #expFilter = new ExpFilter(0.35);\n #extraInferenceTime = 0;\n #logger = log();\n\n constructor(vad: VAD, opts: VADOptions, model: OnnxModel) {\n super(vad);\n this.#opts = opts;\n this.#model = model;\n\n this.#task = new Promise(async () => {\n let inferenceData = new Float32Array(this.#model.windowSizeSamples);\n\n // a copy is exposed to the user in END_OF_SPEECH\n let speechBuffer: Int16Array | null = null;\n let speechBufferMaxReached = false;\n let speechBufferIndex = 0;\n\n // \"pub\" means public, these values are exposed to the users through events\n let pubSpeaking = false;\n let pubSpeechDuration = 0;\n let pubSilenceDuration = 0;\n let pubCurrentSample = 0;\n let pubTimestamp = 0;\n let pubSampleRate = 0;\n let pubPrefixPaddingSamples = 0; // size in samples of padding data\n\n let speechThresholdDuration = 0;\n let silenceThresholdDuration = 0;\n\n let 
inputFrames = [];\n let inferenceFrames: AudioFrame[] = [];\n let resampler: AudioResampler | null = null;\n\n // used to avoid drift when the sampleRate ratio is not an integer\n let inputCopyRemainingFrac = 0.0;\n\n for await (const frame of this.input) {\n if (typeof frame === 'symbol') {\n continue; // ignore flush sentinel for now\n }\n\n if (!pubSampleRate || !speechBuffer) {\n pubSampleRate = frame.sampleRate;\n pubPrefixPaddingSamples = Math.trunc(\n (this.#opts.prefixPaddingDuration * pubSampleRate) / 1000,\n );\n\n speechBuffer = new Int16Array(\n this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples,\n );\n\n if (this.#opts.sampleRate !== pubSampleRate) {\n // resampling needed: the input sample rate isn't the same as the model's\n // sample rate used for inference\n resampler = new AudioResampler(\n pubSampleRate,\n this.#opts.sampleRate,\n 1,\n AudioResamplerQuality.QUICK, // VAD doesn't need high quality\n );\n }\n } else if (frame.sampleRate !== pubSampleRate) {\n this.#logger.error('a frame with a different sample rate was already published');\n continue;\n }\n\n inputFrames.push(frame);\n if (resampler) {\n inferenceFrames.push(...resampler.push(frame));\n } else {\n inferenceFrames.push(frame);\n }\n\n while (true) {\n const startTime = process.hrtime.bigint();\n const availableInferenceSamples = inferenceFrames\n .map((x) => x.samplesPerChannel)\n .reduce((acc, x) => acc + x, 0);\n\n if (availableInferenceSamples < this.#model.windowSizeSamples) {\n break; // not enough samples to run inference\n }\n\n const inputFrame = mergeFrames(inputFrames);\n const inferenceFrame = mergeFrames(inferenceFrames);\n\n // convert data to f32\n inferenceData = Float32Array.from(\n inferenceFrame.data.subarray(0, this.#model.windowSizeSamples),\n (x) => x / 32767,\n );\n\n const p = await this.#model\n .run(inferenceData)\n .then((data) => this.#expFilter.apply(1, data));\n\n const windowDuration = (this.#model.windowSizeSamples / 
this.#opts.sampleRate) * 1000;\n pubCurrentSample += this.#model.windowSizeSamples;\n pubTimestamp += windowDuration;\n const resamplingRatio = pubSampleRate / this.#model.sampleRate;\n const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;\n const toCopyInt = Math.trunc(toCopy);\n inputCopyRemainingFrac = toCopy - toCopyInt;\n\n // copy the inference window to the speech buffer\n const availableSpace = speechBuffer.length - speechBufferIndex;\n const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);\n if (toCopyBuffer > 0) {\n speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);\n speechBufferIndex += toCopyBuffer;\n } else if (!speechBufferMaxReached) {\n speechBufferMaxReached = true;\n this.#logger.warn(\n 'maxBufferedSpeech reached, ignoring further data for the current speech input',\n );\n }\n\n const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.#extraInferenceTime = Math.max(\n 0,\n this.#extraInferenceTime + inferenceDuration - windowDuration,\n );\n if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {\n this.#logger\n .child({ delay: this.#extraInferenceTime })\n .warn('inference is slower than realtime');\n }\n\n if (pubSpeaking) {\n pubSpeechDuration += inferenceDuration;\n } else {\n pubSilenceDuration += inferenceDuration;\n }\n\n this.queue.put({\n type: VADEventType.INFERENCE_DONE,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [\n new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),\n ],\n speaking: pubSpeaking,\n rawAccumulatedSilence: silenceThresholdDuration,\n rawAccumulatedSpeech: speechThresholdDuration,\n });\n\n const resetWriteCursor = () => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n if (speechBufferIndex <= 
pubPrefixPaddingSamples) {\n return;\n }\n\n const paddingData = speechBuffer.subarray(\n speechBufferIndex - pubPrefixPaddingSamples,\n speechBufferIndex,\n );\n speechBuffer.set(paddingData, 0);\n speechBufferIndex = pubPrefixPaddingSamples;\n speechBufferMaxReached = false;\n };\n\n const copySpeechBuffer = (): AudioFrame => {\n if (!speechBuffer) throw new Error('speechBuffer is empty');\n return new AudioFrame(\n speechBuffer.subarray(0, speechBufferIndex),\n pubSampleRate,\n 1,\n speechBufferIndex,\n );\n };\n\n if (p > this.#opts.activationThreshold) {\n speechThresholdDuration += windowDuration;\n silenceThresholdDuration = 0;\n if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {\n pubSpeaking = true;\n pubSilenceDuration = 0;\n pubSpeechDuration = speechThresholdDuration;\n\n this.queue.put({\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n }\n } else {\n silenceThresholdDuration += windowDuration;\n speechThresholdDuration = 0;\n\n if (!pubSpeaking) {\n resetWriteCursor();\n }\n\n if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {\n pubSpeaking = false;\n pubSpeechDuration = 0;\n pubSilenceDuration = silenceThresholdDuration;\n\n this.queue.put({\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: pubCurrentSample,\n timestamp: pubTimestamp,\n silenceDuration: pubSilenceDuration,\n speechDuration: pubSpeechDuration,\n probability: p,\n inferenceDuration,\n frames: [copySpeechBuffer()],\n speaking: pubSpeaking,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n });\n\n resetWriteCursor();\n }\n }\n\n inputFrames = [];\n inferenceFrames = [];\n\n if (inputFrame.data.length > toCopyInt) {\n const data = 
inputFrame.data.subarray(toCopyInt);\n inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));\n }\n if (inferenceFrame.data.length > this.#model.windowSizeSamples) {\n const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);\n inferenceFrames.push(\n new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)),\n );\n }\n }\n }\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AACP,sBAAkE;AAGlE,wBAA+C;AAE/C,MAAM,2BAA2B;AAmBjC,MAAM,oBAAgC;AAAA,EACpC,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,uBAAuB;AAAA,EACvB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,YAAY;AAAA,EACZ,UAAU;AACZ;AAEO,MAAM,YAAY,cAAAA,IAAQ;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,SAA2B,MAAkB;AACvD,UAAM,EAAE,gBAAgB,GAAG,CAAC;AAC5B,SAAK,WAAW;AAChB,SAAK,QAAQ;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,aAAa,KAAK,OAA4B,CAAC,GAAiB;AAC9D,UAAM,aAAyB,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC/D,UAAM,UAAU,UAAM,uCAAoB,WAAW,QAAQ;AAC7D,WAAO,IAAI,IAAI,SAAS,UAAU;AAAA,EACpC;AAAA,EAEA,SAAoB;AAClB,WAAO,IAAI,UAAU,MAAM,KAAK,OAAO,IAAI,4BAAU,KAAK,UAAU,KAAK,MAAM,UAAU,CAAC;AAAA,EAC5F;AACF;AAEO,MAAM,kBAAkB,cAAAC,UAAW;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAa,IAAI,wBAAU,IAAI;AAAA,EAC/B,sBAAsB;AAAA,EACtB,cAAU,mBAAI;AAAA,EAEd,YAAY,KAAU,MAAkB,OAAkB;AACxD,UAAM,GAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAEd,SAAK,QAAQ,IAAI,QAAQ,YAAY;AACnC,UAAI,gBAAgB,IAAI,aAAa,KAAK,OAAO,iBAAiB;AAGlE,UAAI,eAAkC;AACtC,UAAI,yBAAyB;AAC7B,UAAI,oBAAoB;AAGxB,UAAI,cAAc;AAClB,UAAI,oBAAoB;AACxB,UAAI,qBAAqB;AACzB,UAAI,mBAAmB;AACvB,UAAI,eAAe;AACnB,UAAI,gBAAgB;AACpB,UAAI,0BAA0B;AAE9B,UAAI,0BAA0B;AAC9B,UAAI,2BAA2B;AAE/B,UAAI,cAAc,CAAC;AACnB,UAAI,kBAAgC,CAAC;AACrC,UAAI,YAAmC;AAGvC,UAAI,yBAAyB;AAE7B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,OAAO,UAAU,UAAU;AAC7B;AAAA,QACF;AAEA,YAAI,CAAC,iBAAiB,CAAC,cAAc;AACnC,0BAAgB,MAAM;AACtB,oCAA0B,KAAK;AAAA,YAC5B,KAAK,MAAM,wBAAwB,gBAAiB;AAAA,UACvD;AAEA,yBA
Ae,IAAI;AAAA,YACjB,KAAK,MAAM,oBAAoB,gBAAgB;AAAA,UACjD;AAEA,cAAI,KAAK,MAAM,eAAe,eAAe;AAG3C,wBAAY,IAAI;AAAA,cACd;AAAA,cACA,KAAK,MAAM;AAAA,cACX;AAAA,cACA,sCAAsB;AAAA;AAAA,YACxB;AAAA,UACF;AAAA,QACF,WAAW,MAAM,eAAe,eAAe;AAC7C,eAAK,QAAQ,MAAM,4DAA4D;AAC/E;AAAA,QACF;AAEA,oBAAY,KAAK,KAAK;AACtB,YAAI,WAAW;AACb,0BAAgB,KAAK,GAAG,UAAU,KAAK,KAAK,CAAC;AAAA,QAC/C,OAAO;AACL,0BAAgB,KAAK,KAAK;AAAA,QAC5B;AAEA,eAAO,MAAM;AACX,gBAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,gBAAM,4BAA4B,gBAC/B,IAAI,CAAC,MAAM,EAAE,iBAAiB,EAC9B,OAAO,CAAC,KAAK,MAAM,MAAM,GAAG,CAAC;AAEhC,cAAI,4BAA4B,KAAK,OAAO,mBAAmB;AAC7D;AAAA,UACF;AAEA,gBAAM,iBAAa,2BAAY,WAAW;AAC1C,gBAAM,qBAAiB,2BAAY,eAAe;AAGlD,0BAAgB,aAAa;AAAA,YAC3B,eAAe,KAAK,SAAS,GAAG,KAAK,OAAO,iBAAiB;AAAA,YAC7D,CAAC,MAAM,IAAI;AAAA,UACb;AAEA,gBAAM,IAAI,MAAM,KAAK,OAClB,IAAI,aAAa,EACjB,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM,GAAG,IAAI,CAAC;AAEhD,gBAAM,iBAAkB,KAAK,OAAO,oBAAoB,KAAK,MAAM,aAAc;AACjF,8BAAoB,KAAK,OAAO;AAChC,0BAAgB;AAChB,gBAAM,kBAAkB,gBAAgB,KAAK,OAAO;AACpD,gBAAM,SAAS,KAAK,OAAO,oBAAoB,kBAAkB;AACjE,gBAAM,YAAY,KAAK,MAAM,MAAM;AACnC,mCAAyB,SAAS;AAGlC,gBAAM,iBAAiB,aAAa,SAAS;AAC7C,gBAAM,eAAe,KAAK,IAAI,KAAK,OAAO,mBAAmB,cAAc;AAC3E,cAAI,eAAe,GAAG;AACpB,yBAAa,IAAI,WAAW,KAAK,SAAS,GAAG,YAAY,GAAG,iBAAiB;AAC7E,iCAAqB;AAAA,UACvB,WAAW,CAAC,wBAAwB;AAClC,qCAAyB;AACzB,iBAAK,QAAQ;AAAA,cACX;AAAA,YACF;AAAA,UACF;AAEA,gBAAM,oBAAoB,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACxF,eAAK,sBAAsB,KAAK;AAAA,YAC9B;AAAA,YACA,KAAK,sBAAsB,oBAAoB;AAAA,UACjD;AACA,cAAI,KAAK,sBAAsB,0BAA0B;AACvD,iBAAK,QACF,MAAM,EAAE,OAAO,KAAK,oBAAoB,CAAC,EACzC,KAAK,mCAAmC;AAAA,UAC7C;AAEA,cAAI,aAAa;AACf,iCAAqB;AAAA,UACvB,OAAO;AACL,kCAAsB;AAAA,UACxB;AAEA,eAAK,MAAM,IAAI;AAAA,YACb,MAAM,2BAAa;AAAA,YACnB,cAAc;AAAA,YACd,WAAW;AAAA,YACX,iBAAiB;AAAA,YACjB,gBAAgB;AAAA,YAChB,aAAa;AAAA,YACb;AAAA,YACA,QAAQ;AAAA,cACN,IAAI,2BAAW,WAAW,KAAK,SAAS,GAAG,SAAS,GAAG,eAAe,GAAG,SAAS;AAAA,YACpF;AAAA,YACA,UAAU;AAAA,YACV,uBAAuB;AAAA,YACvB,sBAAsB;AAAA,UACxB,CAAC;AAED,gBAAM,mBAAmB,MAAM;AAC7B,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,gBAAI,qBAAqB,yBAAyB;AAChD;AA
AA,YACF;AAEA,kBAAM,cAAc,aAAa;AAAA,cAC/B,oBAAoB;AAAA,cACpB;AAAA,YACF;AACA,yBAAa,IAAI,aAAa,CAAC;AAC/B,gCAAoB;AACpB,qCAAyB;AAAA,UAC3B;AAEA,gBAAM,mBAAmB,MAAkB;AACzC,gBAAI,CAAC,aAAc,OAAM,IAAI,MAAM,uBAAuB;AAC1D,mBAAO,IAAI;AAAA,cACT,aAAa,SAAS,GAAG,iBAAiB;AAAA,cAC1C;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAAA,UACF;AAEA,cAAI,IAAI,KAAK,MAAM,qBAAqB;AACtC,uCAA2B;AAC3B,uCAA2B;AAC3B,gBAAI,CAAC,eAAe,2BAA2B,KAAK,MAAM,mBAAmB;AAC3E,4BAAc;AACd,mCAAqB;AACrB,kCAAoB;AAEpB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAAA,YACH;AAAA,UACF,OAAO;AACL,wCAA4B;AAC5B,sCAA0B;AAE1B,gBAAI,CAAC,aAAa;AAChB,+BAAiB;AAAA,YACnB;AAEA,gBAAI,eAAe,2BAA2B,KAAK,MAAM,oBAAoB;AAC3E,4BAAc;AACd,kCAAoB;AACpB,mCAAqB;AAErB,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,2BAAa;AAAA,gBACnB,cAAc;AAAA,gBACd,WAAW;AAAA,gBACX,iBAAiB;AAAA,gBACjB,gBAAgB;AAAA,gBAChB,aAAa;AAAA,gBACb;AAAA,gBACA,QAAQ,CAAC,iBAAiB,CAAC;AAAA,gBAC3B,UAAU;AAAA,gBACV,uBAAuB;AAAA,gBACvB,sBAAsB;AAAA,cACxB,CAAC;AAED,+BAAiB;AAAA,YACnB;AAAA,UACF;AAEA,wBAAc,CAAC;AACf,4BAAkB,CAAC;AAEnB,cAAI,WAAW,KAAK,SAAS,WAAW;AACtC,kBAAM,OAAO,WAAW,KAAK,SAAS,SAAS;AAC/C,wBAAY,KAAK,IAAI,2BAAW,MAAM,eAAe,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC;AAAA,UACtF;AACA,cAAI,eAAe,KAAK,SAAS,KAAK,OAAO,mBAAmB;AAC9D,kBAAM,OAAO,eAAe,KAAK,SAAS,KAAK,OAAO,iBAAiB;AACvE,4BAAgB;AAAA,cACd,IAAI,2BAAW,MAAM,KAAK,MAAM,YAAY,GAAG,KAAK,MAAM,KAAK,SAAS,CAAC,CAAC;AAAA,YAC5E;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AACF;","names":["baseVAD","baseStream"]}
package/dist/vad.d.ts CHANGED
@@ -21,6 +21,7 @@ export interface VADOptions {
21
21
  }
22
22
  export declare class VAD extends baseVAD {
23
23
  #private;
24
+ label: string;
24
25
  constructor(session: InferenceSession, opts: VADOptions);
25
26
  /**
26
27
  * Load and initialize the Silero VAD model.
@@ -48,11 +49,11 @@ export declare class VAD extends baseVAD {
48
49
  * @param options -
49
50
  * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
50
51
  */
51
- static load(opts?: VADOptions): Promise<VAD>;
52
+ static load(opts?: Partial<VADOptions>): Promise<VAD>;
52
53
  stream(): VADStream;
53
54
  }
54
55
  export declare class VADStream extends baseStream {
55
56
  #private;
56
- constructor(opts: VADOptions, model: OnnxModel);
57
+ constructor(vad: VAD, opts: VADOptions, model: OnnxModel);
57
58
  }
58
59
  //# sourceMappingURL=vad.d.ts.map
package/dist/vad.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;gBAIlB,OAAO,EAAE,gBAAgB,EAAE,IAAI,EAAE,UAAU;IAMvD;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;WACU,IAAI,CAAC,IAAI,aAAoB,GAAG,OAAO,CAAC,GAAG,CAAC;IAKzD,MAAM,IAAI,SAAS;CAGpB;AAED,qBAAa,SAAU,SAAQ,UAAU;;gBAQ3B,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS;CA2O/C"}
1
+ {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,EAGL,SAAS,IAAI,UAAU,EACvB,GAAG,IAAI,OAAO,EAGf,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACzD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAuB,MAAM,iBAAiB,CAAC;AAIjE,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,iBAAiB,EAAE,MAAM,CAAC;IAC1B,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,sDAAsD;IACtD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,wEAAwE;IACxE,UAAU,EAAE,UAAU,CAAC;IACvB,yCAAyC;IACzC,QAAQ,EAAE,OAAO,CAAC;CACnB;AAYD,qBAAa,GAAI,SAAQ,OAAO;;IAG9B,KAAK,SAAgB;gBAET,OAAO,EAAE,gBAAgB,EAAE,IAAI,EAAE,UAAU;IAMvD;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;WACU,IAAI,CAAC,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM,GAAG,OAAO,CAAC,GAAG,CAAC;IAM/D,MAAM,IAAI,SAAS;CAGpB;AAED,qBAAa,SAAU,SAAQ,UAAU;;gBAQ3B,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS;CAiPzD"}
package/dist/vad.js CHANGED
@@ -1,250 +1,274 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import { ExpFilter, VADEventType, VADStream as baseStream, VAD as baseVAD, log, mergeFrames, } from '@livekit/agents';
5
- import { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';
6
- import { OnnxModel, newInferenceSession } from './onnx_model.js';
7
- const SLOW_INFERENCE_THRESHOLD = 200; // late by 200ms
1
+ import {
2
+ ExpFilter,
3
+ VADEventType,
4
+ VADStream as baseStream,
5
+ VAD as baseVAD,
6
+ log,
7
+ mergeFrames
8
+ } from "@livekit/agents";
9
+ import { AudioFrame, AudioResampler, AudioResamplerQuality } from "@livekit/rtc-node";
10
+ import { OnnxModel, newInferenceSession } from "./onnx_model.js";
11
+ const SLOW_INFERENCE_THRESHOLD = 200;
8
12
  const defaultVADOptions = {
9
- minSpeechDuration: 50,
10
- minSilenceDuration: 250,
11
- prefixPaddingDuration: 500,
12
- maxBufferedSpeech: 60000,
13
- activationThreshold: 0.5,
14
- sampleRate: 16000,
15
- forceCPU: true,
13
+ minSpeechDuration: 50,
14
+ minSilenceDuration: 250,
15
+ prefixPaddingDuration: 500,
16
+ maxBufferedSpeech: 6e4,
17
+ activationThreshold: 0.5,
18
+ sampleRate: 16e3,
19
+ forceCPU: true
16
20
  };
17
- export class VAD extends baseVAD {
18
- #session;
19
- #opts;
20
- constructor(session, opts) {
21
- super({ updateInterval: 32 });
22
- this.#session = session;
23
- this.#opts = opts;
24
- }
25
- /**
26
- * Load and initialize the Silero VAD model.
27
- *
28
- * This method loads the ONNX model and prepares it for inference. When options are not provided,
29
- * sane defaults are used.
30
- *
31
- * @remarks
32
- * This method may take time to load the model into memory.
33
- * It is recommended to call this method inside your prewarm mechanism.
34
- *
35
- * @example
36
- * ```ts
37
- * export default defineAgent({
38
- * prewarm: async (proc: JobProcess) => {
39
- * proc.userData.vad = await VAD.load();
40
- * },
41
- * entry: async (ctx: JobContext) => {
42
- * const vad = ctx.proc.userData.vad! as VAD;
43
- * // the rest of your agent logic
44
- * },
45
- * });
46
- * ```
47
- *
48
- * @param options -
49
- * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
50
- */
51
- static async load(opts = defaultVADOptions) {
52
- const session = await newInferenceSession(opts.forceCPU);
53
- return new VAD(session, opts);
54
- }
55
- stream() {
56
- return new VADStream(this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
57
- }
21
+ class VAD extends baseVAD {
22
+ #session;
23
+ #opts;
24
+ label = "silero.VAD";
25
+ constructor(session, opts) {
26
+ super({ updateInterval: 32 });
27
+ this.#session = session;
28
+ this.#opts = opts;
29
+ }
30
+ /**
31
+ * Load and initialize the Silero VAD model.
32
+ *
33
+ * This method loads the ONNX model and prepares it for inference. When options are not provided,
34
+ * sane defaults are used.
35
+ *
36
+ * @remarks
37
+ * This method may take time to load the model into memory.
38
+ * It is recommended to call this method inside your prewarm mechanism.
39
+ *
40
+ * @example
41
+ * ```ts
42
+ * export default defineAgent({
43
+ * prewarm: async (proc: JobProcess) => {
44
+ * proc.userData.vad = await VAD.load();
45
+ * },
46
+ * entry: async (ctx: JobContext) => {
47
+ * const vad = ctx.proc.userData.vad! as VAD;
48
+ * // the rest of your agent logic
49
+ * },
50
+ * });
51
+ * ```
52
+ *
53
+ * @param options -
54
+ * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
55
+ */
56
+ static async load(opts = {}) {
57
+ const mergedOpts = { ...defaultVADOptions, ...opts };
58
+ const session = await newInferenceSession(mergedOpts.forceCPU);
59
+ return new VAD(session, mergedOpts);
60
+ }
61
+ stream() {
62
+ return new VADStream(this, this.#opts, new OnnxModel(this.#session, this.#opts.sampleRate));
63
+ }
58
64
  }
59
- export class VADStream extends baseStream {
60
- #opts;
61
- #model;
62
- #task;
63
- #expFilter = new ExpFilter(0.35);
64
- #extraInferenceTime = 0;
65
- #logger = log();
66
- constructor(opts, model) {
67
- super();
68
- this.#opts = opts;
69
- this.#model = model;
70
- this.#task = new Promise(async () => {
71
- let inferenceData = new Float32Array(this.#model.windowSizeSamples);
72
- // a copy is exposed to the user in END_OF_SPEECH
73
- let speechBuffer = null;
74
- let speechBufferMaxReached = false;
75
- let speechBufferIndex = 0;
76
- // "pub" means public, these values are exposed to the users through events
77
- let pubSpeaking = false;
78
- let pubSpeechDuration = 0;
79
- let pubSilenceDuration = 0;
80
- let pubCurrentSample = 0;
81
- let pubTimestamp = 0;
82
- let pubSampleRate = 0;
83
- let pubPrefixPaddingSamples = 0; // size in samples of padding data
84
- let speechThresholdDuration = 0;
85
- let silenceThresholdDuration = 0;
86
- let inputFrames = [];
87
- let inferenceFrames = [];
88
- let resampler = null;
89
- // used to avoid drift when the sampleRate ratio is not an integer
90
- let inputCopyRemainingFrac = 0.0;
91
- for await (const frame of this.input) {
92
- if (typeof frame === 'symbol') {
93
- continue; // ignore flush sentinel for now
94
- }
95
- if (!pubSampleRate || !speechBuffer) {
96
- pubSampleRate = frame.sampleRate;
97
- pubPrefixPaddingSamples = Math.trunc((this.#opts.prefixPaddingDuration * pubSampleRate) / 1000);
98
- speechBuffer = new Int16Array(this.#opts.maxBufferedSpeech * pubSampleRate + pubPrefixPaddingSamples);
99
- if (this.#opts.sampleRate !== pubSampleRate) {
100
- // resampling needed: the input sample rate isn't the same as the model's
101
- // sample rate used for inference
102
- resampler = new AudioResampler(pubSampleRate, this.#opts.sampleRate, 1, AudioResamplerQuality.QUICK);
103
- }
104
- }
105
- else if (frame.sampleRate !== pubSampleRate) {
106
- this.#logger.error('a frame with a different sample rate was already published');
107
- continue;
108
- }
109
- inputFrames.push(frame);
110
- if (resampler) {
111
- inferenceFrames.push(...resampler.push(frame));
112
- }
113
- else {
114
- inferenceFrames.push(frame);
115
- }
116
- while (true) {
117
- const startTime = process.hrtime.bigint();
118
- const availableInferenceSamples = inferenceFrames
119
- .map((x) => x.samplesPerChannel)
120
- .reduce((acc, x) => acc + x, 0);
121
- if (availableInferenceSamples < this.#model.windowSizeSamples) {
122
- break; // not enough samples to run inference
123
- }
124
- const inputFrame = mergeFrames(inputFrames);
125
- const inferenceFrame = mergeFrames(inferenceFrames);
126
- // convert data to f32
127
- inferenceData = Float32Array.from(inferenceFrame.data.subarray(0, this.#model.windowSizeSamples), (x) => x / 32767);
128
- const p = await this.#model
129
- .run(inferenceData)
130
- .then((data) => this.#expFilter.apply(1, data));
131
- const windowDuration = (this.#model.windowSizeSamples / this.#opts.sampleRate) * 1000;
132
- pubCurrentSample += this.#model.windowSizeSamples;
133
- pubTimestamp += windowDuration;
134
- const resamplingRatio = pubSampleRate / this.#model.sampleRate;
135
- const toCopy = this.#model.windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
136
- const toCopyInt = Math.trunc(toCopy);
137
- inputCopyRemainingFrac = toCopy - toCopyInt;
138
- // copy the inference window to the speech buffer
139
- const availableSpace = speechBuffer.length - speechBufferIndex;
140
- const toCopyBuffer = Math.min(this.#model.windowSizeSamples, availableSpace);
141
- if (toCopyBuffer > 0) {
142
- speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
143
- speechBufferIndex += toCopyBuffer;
144
- }
145
- else if (!speechBufferMaxReached) {
146
- speechBufferMaxReached = true;
147
- this.#logger.warn('maxBufferedSpeech reached, ignoring further data for the current speech input');
148
- }
149
- const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
150
- this.#extraInferenceTime = Math.max(0, this.#extraInferenceTime + inferenceDuration - windowDuration);
151
- if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
152
- this.#logger
153
- .child({ delay: this.#extraInferenceTime })
154
- .warn('inference is slower than realtime');
155
- }
156
- if (pubSpeaking) {
157
- pubSpeechDuration += inferenceDuration;
158
- }
159
- else {
160
- pubSilenceDuration += inferenceDuration;
161
- }
162
- this.queue.put({
163
- type: VADEventType.INFERENCE_DONE,
164
- samplesIndex: pubCurrentSample,
165
- timestamp: pubTimestamp,
166
- silenceDuration: pubSilenceDuration,
167
- speechDuration: pubSpeechDuration,
168
- probability: p,
169
- inferenceDuration,
170
- frames: [
171
- new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt),
172
- ],
173
- speaking: pubSpeaking,
174
- });
175
- const resetWriteCursor = () => {
176
- if (!speechBuffer)
177
- throw new Error('speechBuffer is empty');
178
- if (speechBufferIndex <= pubPrefixPaddingSamples) {
179
- return;
180
- }
181
- const paddingData = speechBuffer.subarray(speechBufferIndex - pubPrefixPaddingSamples, speechBufferIndex);
182
- speechBuffer.set(paddingData, 0);
183
- speechBufferIndex = pubPrefixPaddingSamples;
184
- speechBufferMaxReached = false;
185
- };
186
- const copySpeechBuffer = () => {
187
- if (!speechBuffer)
188
- throw new Error('speechBuffer is empty');
189
- return new AudioFrame(speechBuffer.subarray(0, speechBufferIndex), pubSampleRate, 1, speechBufferIndex);
190
- };
191
- if (p > this.#opts.activationThreshold) {
192
- speechThresholdDuration += windowDuration;
193
- silenceThresholdDuration = 0;
194
- if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
195
- pubSpeaking = true;
196
- pubSilenceDuration = 0;
197
- pubSpeechDuration = speechThresholdDuration;
198
- this.queue.put({
199
- type: VADEventType.START_OF_SPEECH,
200
- samplesIndex: pubCurrentSample,
201
- timestamp: pubTimestamp,
202
- silenceDuration: pubSilenceDuration,
203
- speechDuration: pubSpeechDuration,
204
- probability: p,
205
- inferenceDuration,
206
- frames: [copySpeechBuffer()],
207
- speaking: pubSpeaking,
208
- });
209
- }
210
- }
211
- else {
212
- silenceThresholdDuration += windowDuration;
213
- speechThresholdDuration = 0;
214
- if (!pubSpeaking) {
215
- resetWriteCursor();
216
- }
217
- if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
218
- pubSpeaking = false;
219
- pubSpeechDuration = 0;
220
- pubSilenceDuration = silenceThresholdDuration;
221
- this.queue.put({
222
- type: VADEventType.END_OF_SPEECH,
223
- samplesIndex: pubCurrentSample,
224
- timestamp: pubTimestamp,
225
- silenceDuration: pubSilenceDuration,
226
- speechDuration: pubSpeechDuration,
227
- probability: p,
228
- inferenceDuration,
229
- frames: [copySpeechBuffer()],
230
- speaking: pubSpeaking,
231
- });
232
- resetWriteCursor();
233
- }
234
- }
235
- inputFrames = [];
236
- inferenceFrames = [];
237
- if (inputFrame.data.length > toCopyInt) {
238
- const data = inputFrame.data.subarray(toCopyInt);
239
- inputFrames.push(new AudioFrame(data, pubSampleRate, 1, Math.trunc(data.length / 2)));
240
- }
241
- if (inferenceFrame.data.length > this.#model.windowSizeSamples) {
242
- const data = inferenceFrame.data.subarray(this.#model.windowSizeSamples);
243
- inferenceFrames.push(new AudioFrame(data, this.#opts.sampleRate, 1, Math.trunc(data.length / 2)));
244
- }
245
- }
65
/**
 * Streaming voice-activity detector.
 *
 * Consumes audio frames from `this.input`, runs the model on fixed-size
 * windows and pushes `INFERENCE_DONE`, `START_OF_SPEECH` and `END_OF_SPEECH`
 * events onto `this.queue`.
 *
 * All durations handled here (`minSpeechDuration`, `minSilenceDuration`,
 * `prefixPaddingDuration`, `maxBufferedSpeech`, event durations and
 * timestamps) are expressed in milliseconds.
 */
class VADStream extends baseStream {
  #opts;
  #model;
  // Promise of the background processing loop; settles when the input ends.
  #task;
  // Smooths the raw per-window probability returned by the model.
  #expFilter = new ExpFilter(0.35);
  // Accumulated amount (ms) by which inference lags behind realtime.
  #extraInferenceTime = 0;
  #logger = log();

  /**
   * @param vad - the VAD instance that owns this stream (forwarded to the base class)
   * @param opts - VAD options; durations are in milliseconds
   * @param model - model wrapper exposing `windowSizeSamples`, `sampleRate` and `run()`
   */
  constructor(vad, opts, model) {
    super(vad);
    this.#opts = opts;
    this.#model = model;
    // Start the loop directly instead of wrapping an async function in
    // `new Promise(...)`, which would never settle.
    this.#task = this.#run();
  }

  /**
   * Main processing loop: buffers incoming audio, runs inference one window
   * at a time and publishes the resulting events.
   */
  async #run() {
    let speechBuffer = null;
    let speechBufferMaxReached = false;
    let speechBufferIndex = 0;
    let pubSpeaking = false;
    let pubSpeechDuration = 0;
    let pubSilenceDuration = 0;
    let pubCurrentSample = 0;
    let pubTimestamp = 0;
    let pubSampleRate = 0;
    let pubPrefixPaddingSamples = 0;
    let speechThresholdDuration = 0;
    let silenceThresholdDuration = 0;
    let inputFrames = [];
    let inferenceFrames = [];
    let resampler = null;
    let inputCopyRemainingFrac = 0;

    // Keeps only the trailing prefix padding at the start of the speech
    // buffer and rewinds the write cursor right after it.
    const resetWriteCursor = () => {
      if (!speechBuffer) throw new Error("speechBuffer is empty");
      if (speechBufferIndex <= pubPrefixPaddingSamples) {
        return;
      }
      const paddingData = speechBuffer.subarray(
        speechBufferIndex - pubPrefixPaddingSamples,
        speechBufferIndex
      );
      speechBuffer.set(paddingData, 0);
      speechBufferIndex = pubPrefixPaddingSamples;
      speechBufferMaxReached = false;
    };

    // Returns the buffered speech as a frame. `slice` (not `subarray`) is
    // used so the frame owns its samples: the speech buffer is overwritten
    // by `resetWriteCursor` right after END_OF_SPEECH is queued.
    const copySpeechBuffer = () => {
      if (!speechBuffer) throw new Error("speechBuffer is empty");
      return new AudioFrame(
        speechBuffer.slice(0, speechBufferIndex),
        pubSampleRate,
        1,
        speechBufferIndex
      );
    };

    for await (const frame of this.input) {
      if (typeof frame === "symbol") {
        // Non-audio sentinel values carry no samples.
        continue;
      }

      if (!pubSampleRate || !speechBuffer) {
        // First audio frame: derive everything that depends on the input rate.
        pubSampleRate = frame.sampleRate;
        pubPrefixPaddingSamples = Math.trunc(
          this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
        );
        // maxBufferedSpeech is in milliseconds, so convert it to samples the
        // same way as the prefix padding above.
        speechBuffer = new Int16Array(
          Math.trunc(this.#opts.maxBufferedSpeech * pubSampleRate / 1e3) + pubPrefixPaddingSamples
        );
        if (this.#opts.sampleRate !== pubSampleRate) {
          // VAD doesn't need high quality
          resampler = new AudioResampler(
            pubSampleRate,
            this.#opts.sampleRate,
            1,
            AudioResamplerQuality.QUICK
          );
        }
      } else if (frame.sampleRate !== pubSampleRate) {
        this.#logger.error("a frame with a different sample rate was already published");
        continue;
      }

      inputFrames.push(frame);
      if (resampler) {
        inferenceFrames.push(...resampler.push(frame));
      } else {
        inferenceFrames.push(frame);
      }

      while (true) {
        const startTime = process.hrtime.bigint();
        const windowSizeSamples = this.#model.windowSizeSamples;
        const availableInferenceSamples = inferenceFrames.reduce(
          (acc, f) => acc + f.samplesPerChannel,
          0
        );
        if (availableInferenceSamples < windowSizeSamples) {
          break;
        }

        const inputFrame = mergeFrames(inputFrames);
        const inferenceFrame = mergeFrames(inferenceFrames);

        // Convert one window of int16 samples to floats for the model.
        const inferenceData = Float32Array.from(
          inferenceFrame.data.subarray(0, windowSizeSamples),
          (x) => x / 32767
        );
        const rawProbability = await this.#model.run(inferenceData);
        const p = this.#expFilter.apply(1, rawProbability);

        const windowDuration = windowSizeSamples / this.#opts.sampleRate * 1e3;
        pubCurrentSample += windowSizeSamples;
        pubTimestamp += windowDuration;

        // Number of *input-rate* samples covered by this inference window;
        // the fractional remainder is carried over to the next window.
        const resamplingRatio = pubSampleRate / this.#model.sampleRate;
        const toCopy = windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
        const toCopyInt = Math.trunc(toCopy);
        inputCopyRemainingFrac = toCopy - toCopyInt;

        // The speech buffer holds input-rate audio, so copy `toCopyInt`
        // samples (not the inference-rate window size).
        const availableSpace = speechBuffer.length - speechBufferIndex;
        const toCopyBuffer = Math.min(toCopyInt, availableSpace);
        if (toCopyBuffer > 0) {
          speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
          speechBufferIndex += toCopyBuffer;
        } else if (!speechBufferMaxReached) {
          speechBufferMaxReached = true;
          this.#logger.warn(
            "maxBufferedSpeech reached, ignoring further data for the current speech input"
          );
        }

        const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1e6));
        this.#extraInferenceTime = Math.max(
          0,
          this.#extraInferenceTime + inferenceDuration - windowDuration
        );
        if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
          this.#logger.child({ delay: this.#extraInferenceTime }).warn("inference is slower than realtime");
        }

        // Published durations track audio time, so advance them by the
        // window length rather than by how long inference took.
        if (pubSpeaking) {
          pubSpeechDuration += windowDuration;
        } else {
          pubSilenceDuration += windowDuration;
        }

        this.queue.put({
          type: VADEventType.INFERENCE_DONE,
          samplesIndex: pubCurrentSample,
          timestamp: pubTimestamp,
          silenceDuration: pubSilenceDuration,
          speechDuration: pubSpeechDuration,
          probability: p,
          inferenceDuration,
          frames: [
            new AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
          ],
          speaking: pubSpeaking,
          rawAccumulatedSilence: silenceThresholdDuration,
          rawAccumulatedSpeech: speechThresholdDuration
        });

        if (p > this.#opts.activationThreshold) {
          speechThresholdDuration += windowDuration;
          silenceThresholdDuration = 0;
          if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
            pubSpeaking = true;
            pubSilenceDuration = 0;
            pubSpeechDuration = speechThresholdDuration;
            this.queue.put({
              type: VADEventType.START_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
              rawAccumulatedSilence: 0,
              rawAccumulatedSpeech: 0
            });
          }
        } else {
          silenceThresholdDuration += windowDuration;
          speechThresholdDuration = 0;
          if (!pubSpeaking) {
            // Not in speech: keep only the rolling prefix padding.
            resetWriteCursor();
          }
          if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
            pubSpeaking = false;
            pubSpeechDuration = 0;
            pubSilenceDuration = silenceThresholdDuration;
            this.queue.put({
              type: VADEventType.END_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
              rawAccumulatedSilence: 0,
              rawAccumulatedSpeech: 0
            });
            resetWriteCursor();
          }
        }

        // Carry samples that were not consumed by this window over to the
        // next iteration. `data` is an Int16Array of mono samples, so the
        // per-channel sample count equals its length.
        inputFrames = [];
        inferenceFrames = [];
        if (inputFrame.data.length > toCopyInt) {
          const data = inputFrame.data.subarray(toCopyInt);
          inputFrames.push(new AudioFrame(data, pubSampleRate, 1, data.length));
        }
        if (inferenceFrame.data.length > windowSizeSamples) {
          const data = inferenceFrame.data.subarray(windowSizeSamples);
          inferenceFrames.push(new AudioFrame(data, this.#opts.sampleRate, 1, data.length));
        }
      }
    }
  }
}
270
+ export {
271
+ VAD,
272
+ VADStream
273
+ };
250
274
  //# sourceMappingURL=vad.js.map