@livekit/agents 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +12 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +13 -13
- package/dist/cli.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -1
- package/dist/inference/tts.d.ts +2 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +1 -5
- package/dist/inference/tts.js.map +1 -1
- package/dist/llm/chat_context.cjs +78 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +16 -0
- package/dist/llm/chat_context.d.ts +16 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +78 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +531 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +531 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/tool_context.cjs +43 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +39 -11
- package/dist/llm/tool_context.d.ts +39 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +42 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +17 -11
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +1 -2
- package/dist/llm/utils.d.ts +1 -2
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +17 -11
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +99 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +61 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +389 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +372 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +7 -0
- package/dist/metrics/base.d.ts +7 -0
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/stt/stt.cjs +1 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -1
- package/dist/stt/stt.d.ts +7 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +1 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/vad.cjs +16 -0
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +6 -0
- package/dist/vad.d.ts +6 -0
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +16 -0
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +83 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +6 -2
- package/dist/voice/agent_activity.d.ts +6 -2
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +83 -8
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +3 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -1
- package/dist/voice/agent_session.d.ts +2 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +3 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +138 -16
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +11 -0
- package/dist/voice/audio_recognition.d.ts +11 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +138 -16
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +8 -3
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +8 -3
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +0 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs +17 -11
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +16 -9
- package/dist/worker.d.ts +16 -9
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +16 -12
- package/dist/worker.js.map +1 -1
- package/package.json +5 -4
- package/src/cli.ts +17 -17
- package/src/inference/stt.ts +2 -1
- package/src/inference/tts.ts +2 -5
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +341 -0
- package/src/llm/chat_context.test.ts +607 -0
- package/src/llm/chat_context.ts +106 -0
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +101 -17
- package/src/llm/utils.ts +18 -15
- package/src/llm/zod-utils.test.ts +476 -0
- package/src/llm/zod-utils.ts +144 -0
- package/src/metrics/base.ts +7 -0
- package/src/stt/stt.ts +6 -0
- package/src/vad.ts +18 -0
- package/src/voice/agent_activity.ts +119 -9
- package/src/voice/agent_session.ts +3 -1
- package/src/voice/audio_recognition.ts +235 -57
- package/src/voice/generation.ts +8 -3
- package/src/voice/room_io/_input.ts +1 -1
- package/src/worker.ts +29 -18
package/dist/stt/stt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioFrame, AudioResampler } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport { APIConnectionError, APIError } from '../_exceptions.js';\nimport { calculateAudioDurationSeconds } from '../audio.js';\nimport { log } from '../log.js';\nimport type { STTMetrics } from '../metrics/base.js';\nimport { DeferredReadableStream } from '../stream/deferred_stream.js';\nimport { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n /** Usage event, emitted periodically to indicate usage metrics. */\n RECOGNITION_USAGE = 4,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\nexport interface RecognitionUsage {\n audioDuration: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n requestId?: string;\n recognitionUsage?: RecognitionUsage;\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\nexport interface STTError {\n type: 'stt_error';\n timestamp: number;\n label: string;\n error: Error;\n recoverable: boolean;\n}\n\nexport type STTCallbacks = {\n ['metrics_collected']: (metrics: STTMetrics) => void;\n ['error']: (error: STTError) => void;\n};\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {\n abstract label: string;\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n async recognize(frame: AudioBuffer): Promise<SpeechEvent> {\n const startTime = process.hrtime.bigint();\n const event = await this._recognize(frame);\n const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.emit('metrics_collected', {\n type: 'stt_metrics',\n requestId: event.requestId ?? '',\n timestamp: Date.now(),\n durationMs,\n label: this.label,\n audioDurationMs: Math.round(calculateAudioDurationSeconds(frame) * 1000),\n streamed: false,\n });\n return event;\n }\n protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;\n\n /**\n * Returns a {@link SpeechStream} that can be used to push audio frames and receive\n * transcriptions\n */\n abstract stream(): SpeechStream;\n}\n\n/**\n * An instance of a speech-to-text stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {\n * console.log(event.alternatives[0].text)\n * }\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SpeechStream class, which inherits this class's methods.\n */\nexport abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();\n protected output = new AsyncIterableQueue<SpeechEvent>();\n protected queue = new AsyncIterableQueue<SpeechEvent>();\n protected neededSampleRate?: number;\n protected resampler?: AudioResampler;\n abstract label: string;\n protected closed = false;\n #stt: STT;\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n private logger = log();\n private _connOptions: APIConnectOptions;\n\n constructor(\n stt: STT,\n sampleRate?: number,\n connectionOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,\n ) {\n this.#stt = stt;\n this._connOptions = connectionOptions;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n this.neededSampleRate = sampleRate;\n this.monitorMetrics();\n this.pumpInput();\n\n // this is a hack to immitate asyncio.create_task so that mainTask\n // is run **after** the constructor has finished. Otherwise we get\n // runtime error when trying to access class variables in the\n // `run` method.\n startSoon(() => this.mainTask().then(() => this.queue.close()));\n }\n\n private async mainTask() {\n for (let i = 0; i < this._connOptions.maxRetry + 1; i++) {\n try {\n return await this.run();\n } catch (error) {\n if (error instanceof APIError) {\n const retryInterval = this._connOptions._intervalForRetry(i);\n\n if (this._connOptions.maxRetry === 0 || !error.retryable) {\n this.emitError({ error, recoverable: false });\n throw error;\n } else if (i === this._connOptions.maxRetry) {\n this.emitError({ error, recoverable: false });\n throw new APIConnectionError({\n message: `failed to recognize speech after ${this._connOptions.maxRetry + 1} attempts`,\n options: { retryable: false },\n });\n } else {\n this.emitError({ error, recoverable: true });\n this.logger.warn(\n { tts: this.#stt.label, attempt: i + 1, error },\n `failed to recognize speech, retrying in ${retryInterval}s`,\n );\n }\n\n if (retryInterval > 0) {\n await delay(retryInterval);\n }\n } else {\n this.emitError({ error: toError(error), recoverable: false });\n throw error;\n }\n }\n }\n }\n\n private emitError({ error, recoverable }: { error: Error; recoverable: boolean }) {\n this.#stt.emit('error', {\n type: 'stt_error',\n timestamp: Date.now(),\n label: this.#stt.label,\n error,\n recoverable,\n });\n }\n\n protected async pumpInput() {\n // TODO(AJS-35): Implement STT with webstreams API\n const inputStream = this.deferredInputStream.stream;\n const reader = inputStream.getReader();\n\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n this.pushFrame(value);\n }\n } catch (error) {\n this.logger.error('Error in STTStream mainTask:', error);\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n for await (const event of this.queue) {\n this.output.put(event);\n if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;\n const metrics: STTMetrics = {\n type: 'stt_metrics',\n timestamp: Date.now(),\n requestId: event.requestId!,\n durationMs: 0,\n label: this.#stt.label,\n audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000),\n streamed: true,\n };\n this.#stt.emit('metrics_collected', metrics);\n }\n this.output.close();\n }\n\n protected abstract run(): Promise<void>;\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** Push an audio frame to the STT */\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.neededSampleRate && frame.sampleRate !== this.neededSampleRate) {\n if (!this.resampler) {\n this.resampler = new AudioResampler(frame.sampleRate, this.neededSampleRate);\n }\n }\n\n if (this.resampler) {\n const frames = this.resampler.push(frame);\n for (const frame of frames) {\n this.input.put(frame);\n }\n } else {\n this.input.put(frame);\n }\n }\n\n /** Flush the STT, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SpeechStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SpeechEvent>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the STT stream */\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SpeechStream {\n return this;\n }\n}\n"],"mappings":"AAGA,SAA0B,sBAAsB;AAEhD,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,gBAAgB;AAC7C,SAAS,qCAAqC;AAC9C,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAiC,mCAAmC;AAEpE,SAAS,oBAAoB,OAAO,WAAW,eAAe;AAGvD,IAAK,kBAAL,kBAAKA,qBAAL;AAML,EAAAA,kCAAA,qBAAkB,KAAlB;AAIA,EAAAA,kCAAA,wBAAqB,KAArB;AAKA,EAAAA,kCAAA,sBAAmB,KAAnB;AAKA,EAAAA,kCAAA,mBAAgB,KAAhB;AAEA,EAAAA,kCAAA,uBAAoB,KAApB;AAtBU,SAAAA;AAAA,GAAA;AA6EL,MAAe,YAAa,aAAsD;AAAA,EAEvF;AAAA,EAEA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAAM,UAAU,OAA0C;AACxD,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,UAAM,QAAQ,MAAM,KAAK,WAAW,KAAK;AACzC,UAAM,aAAa,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACjF,SAAK,KAAK,qBAAqB;AAAA,MAC7B,MAAM;AAAA,MACN,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,iBAAiB,KAAK,MAAM,8BAA8B,KAAK,IAAI,GAAI;AAAA,MACvE,UAAU;AAAA,IACZ,CAAC;AACD,WAAO;AAAA,EACT;AAQF;AAkBO,MAAe,aAA2D;AAAA,EAC/E,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAoE;AAAA,EAChF,SAAS,IAAI,mBAAgC;AAAA,EAC7C,QAAQ,IAAI,mBAAgC;AAAA,EAC5C;AAAA,EACA;AAAA,EAEA,SAAS;AAAA,EACnB;AAAA,EACQ;AAAA,EACA,SAAS,IAAI;AAAA,EACb;AAAA,EAER,YACE,KACA,YACA,oBAAuC,6BACvC;AACA,SAAK,OAAO;AACZ,SAAK,eAAe;AACpB,SAAK,sBAAsB,IAAI,uBAAmC;AAClE,SAAK,mBAAmB;AACxB,SAAK,eAAe;AACpB,SAAK,UAAU;AAMf,cAAU,MAAM,KAAK,SAAS,EAAE,KAAK,MAAM,KAAK,MAAM,MAAM,CAAC,CAAC;AAAA,EAChE;AAAA,EAEA,MAAc,WAAW;AACvB,aAAS,IAAI,GAAG,IAAI,KAAK,aAAa,WAAW,GAAG,KAAK;AACvD,UAAI;AACF,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB,SAAS,OAAO;AACd,YAAI,iBAAiB,UAAU;AAC7B,gBAAM,gBAAgB,KAAK,aAAa,kBAAkB,CAAC;AAE3D,cAAI,KAAK,aAAa,aAAa,KAAK,CAAC,MAAM,WAAW;AACxD,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM;AAAA,UACR,WAAW,MAAM,KAAK,aAAa,UAAU;AAC3C,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM,IAAI,mBAAmB;AAAA,cAC3B,SAAS,oCAAoC,KAAK,aAAa,WAAW,CAAC;AAAA,cAC3E,SAAS,EAAE,WAAW,MAAM;AAAA,YAC9B,CAAC;AAAA,UACH,OAAO;AACL,iBAAK,UAAU,EAAE,OAAO,aAAa,KAAK,CAAC;AAC3C,iBAAK,OAAO;AAAA,cACV,EAAE,KAAK,KAAK,KAAK,OAAO,SAAS,IAAI,GAAG,MAAM;AAAA,cAC9C,2CAA2C,aAAa;AAAA,YAC1D;AAAA,UACF;AAEA,cAAI,gBAAgB,GAAG;AACrB,kBAAM,MAAM,aAAa;AAAA,UAC3B;AAAA,QACF,OAAO;AACL,eAAK,UAAU,EAAE,OAAO,QAAQ,KAAK,GAAG,aAAa,MAAM,CAAC;AAC5D,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,UAAU,EAAE,OAAO,YAAY,GAA2C;AAChF,SAAK,KAAK,KAAK,SAAS;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,KAAK,IAAI;AAAA,MACpB,OAAO,KAAK,KAAK;AAAA,MACjB;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAgB,YAAY;AAE1B,UAAM,cAAc,KAAK,oBAAoB;AAC7C,UAAM,SAAS,YAAY,UAAU;AAErC,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,aAAK,UAAU,KAAK;AAAA,MACtB;AAAA,IACF,SAAS,OAAO;AACd,WAAK,OAAO,MAAM,gCAAgC,KAAK;AAAA,IACzD,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,MAAM,SAAS,0BAAmC;AACtD,YAAM,UAAsB;AAAA,QAC1B,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,WAAW,MAAM;AAAA,QACjB,YAAY;AAAA,QACZ,OAAO,KAAK,KAAK;AAAA,QACjB,iBAAiB,KAAK,MAAM,MAAM,iBAAkB,gBAAgB,GAAI;AAAA,QACxE,UAAU;AAAA,MACZ;AACA,WAAK,KAAK,KAAK,qBAAqB,OAAO;AAAA,IAC7C;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAIA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,oBAAoB,MAAM,eAAe,KAAK,kBAAkB;AACvE,UAAI,CAAC,KAAK,WAAW;AACnB,aAAK,YAAY,IAAI,eAAe,MAAM,YAAY,KAAK,gBAAgB;AAAA,MAC7E;AAAA,IACF;AAEA,QAAI,KAAK,WAAW;AAClB,YAAM,SAAS,KAAK,UAAU,KAAK,KAAK;AACxC,iBAAWC,UAAS,QAAQ;AAC1B,aAAK,MAAM,IAAIA,MAAK;AAAA,MACtB;AAAA,IACF,OAAO;AACL,WAAK,MAAM,IAAI,KAAK;AAAA,IACtB;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,aAAa,cAAc;AAAA,EAC5C;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA6C;AAC3C,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAkB;AACrC,WAAO;AAAA,EACT;AACF;","names":["SpeechEventType","frame"]}
|
|
1
|
+
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioFrame, AudioResampler } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport { APIConnectionError, APIError } from '../_exceptions.js';\nimport { calculateAudioDurationSeconds } from '../audio.js';\nimport { log } from '../log.js';\nimport type { STTMetrics } from '../metrics/base.js';\nimport { DeferredReadableStream } from '../stream/deferred_stream.js';\nimport { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n /** Usage event, emitted periodically to indicate usage metrics. */\n RECOGNITION_USAGE = 4,\n /**\n * Preflight transcript, emitted before final transcript when STT has high confidence\n * but hasn't fully committed yet. Includes all pre-committed transcripts including\n * final transcript from the previous STT run.\n */\n PREFLIGHT_TRANSCRIPT = 5,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\nexport interface RecognitionUsage {\n audioDuration: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n requestId?: string;\n recognitionUsage?: RecognitionUsage;\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\nexport interface STTError {\n type: 'stt_error';\n timestamp: number;\n label: string;\n error: Error;\n recoverable: boolean;\n}\n\nexport type STTCallbacks = {\n ['metrics_collected']: (metrics: STTMetrics) => void;\n ['error']: (error: STTError) => void;\n};\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {\n abstract label: string;\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n async recognize(frame: AudioBuffer): Promise<SpeechEvent> {\n const startTime = process.hrtime.bigint();\n const event = await this._recognize(frame);\n const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.emit('metrics_collected', {\n type: 'stt_metrics',\n requestId: event.requestId ?? '',\n timestamp: Date.now(),\n durationMs,\n label: this.label,\n audioDurationMs: Math.round(calculateAudioDurationSeconds(frame) * 1000),\n streamed: false,\n });\n return event;\n }\n protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;\n\n /**\n * Returns a {@link SpeechStream} that can be used to push audio frames and receive\n * transcriptions\n */\n abstract stream(): SpeechStream;\n}\n\n/**\n * An instance of a speech-to-text stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {\n * console.log(event.alternatives[0].text)\n * }\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SpeechStream class, which inherits this class's methods.\n */\nexport abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();\n protected output = new AsyncIterableQueue<SpeechEvent>();\n protected queue = new AsyncIterableQueue<SpeechEvent>();\n protected neededSampleRate?: number;\n protected resampler?: AudioResampler;\n abstract label: string;\n protected closed = false;\n #stt: STT;\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n private logger = log();\n private _connOptions: APIConnectOptions;\n\n constructor(\n stt: STT,\n sampleRate?: number,\n connectionOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,\n ) {\n this.#stt = stt;\n this._connOptions = connectionOptions;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n this.neededSampleRate = sampleRate;\n this.monitorMetrics();\n this.pumpInput();\n\n // this is a hack to immitate asyncio.create_task so that mainTask\n // is run **after** the constructor has finished. Otherwise we get\n // runtime error when trying to access class variables in the\n // `run` method.\n startSoon(() => this.mainTask().then(() => this.queue.close()));\n }\n\n private async mainTask() {\n for (let i = 0; i < this._connOptions.maxRetry + 1; i++) {\n try {\n return await this.run();\n } catch (error) {\n if (error instanceof APIError) {\n const retryInterval = this._connOptions._intervalForRetry(i);\n\n if (this._connOptions.maxRetry === 0 || !error.retryable) {\n this.emitError({ error, recoverable: false });\n throw error;\n } else if (i === this._connOptions.maxRetry) {\n this.emitError({ error, recoverable: false });\n throw new APIConnectionError({\n message: `failed to recognize speech after ${this._connOptions.maxRetry + 1} attempts`,\n options: { retryable: false },\n });\n } else {\n this.emitError({ error, recoverable: true });\n this.logger.warn(\n { tts: this.#stt.label, attempt: i + 1, error },\n `failed to recognize speech, retrying in ${retryInterval}s`,\n );\n }\n\n if (retryInterval > 0) {\n await delay(retryInterval);\n }\n } else {\n this.emitError({ error: toError(error), recoverable: false });\n throw error;\n }\n }\n }\n }\n\n private emitError({ error, recoverable }: { error: Error; recoverable: boolean }) {\n this.#stt.emit('error', {\n type: 'stt_error',\n timestamp: Date.now(),\n label: this.#stt.label,\n error,\n recoverable,\n });\n }\n\n protected async pumpInput() {\n // TODO(AJS-35): Implement STT with webstreams API\n const inputStream = this.deferredInputStream.stream;\n const reader = inputStream.getReader();\n\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n this.pushFrame(value);\n }\n } catch (error) {\n this.logger.error('Error in STTStream mainTask:', error);\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n for await (const event of this.queue) {\n this.output.put(event);\n if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;\n const metrics: STTMetrics = {\n type: 'stt_metrics',\n timestamp: Date.now(),\n requestId: event.requestId!,\n durationMs: 0,\n label: this.#stt.label,\n audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000),\n streamed: true,\n };\n this.#stt.emit('metrics_collected', metrics);\n }\n this.output.close();\n }\n\n protected abstract run(): Promise<void>;\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** Push an audio frame to the STT */\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.neededSampleRate && frame.sampleRate !== this.neededSampleRate) {\n if (!this.resampler) {\n this.resampler = new AudioResampler(frame.sampleRate, this.neededSampleRate);\n }\n }\n\n if (this.resampler) {\n const frames = this.resampler.push(frame);\n for (const frame of frames) {\n this.input.put(frame);\n }\n } else {\n this.input.put(frame);\n }\n }\n\n /** Flush the STT, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SpeechStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SpeechEvent>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the STT stream */\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SpeechStream {\n return this;\n }\n}\n"],"mappings":"AAGA,SAA0B,sBAAsB;AAEhD,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,gBAAgB;AAC7C,SAAS,qCAAqC;AAC9C,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAiC,mCAAmC;AAEpE,SAAS,oBAAoB,OAAO,WAAW,eAAe;AAGvD,IAAK,kBAAL,kBAAKA,qBAAL;AAML,EAAAA,kCAAA,qBAAkB,KAAlB;AAIA,EAAAA,kCAAA,wBAAqB,KAArB;AAKA,EAAAA,kCAAA,sBAAmB,KAAnB;AAKA,EAAAA,kCAAA,mBAAgB,KAAhB;AAEA,EAAAA,kCAAA,uBAAoB,KAApB;AAMA,EAAAA,kCAAA,0BAAuB,KAAvB;AA5BU,SAAAA;AAAA,GAAA;AAmFL,MAAe,YAAa,aAAsD;AAAA,EAEvF;AAAA,EAEA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAAM,UAAU,OAA0C;AACxD,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,UAAM,QAAQ,MAAM,KAAK,WAAW,KAAK;AACzC,UAAM,aAAa,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACjF,SAAK,KAAK,qBAAqB;AAAA,MAC7B,MAAM;AAAA,MACN,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,iBAAiB,KAAK,MAAM,8BAA8B,KAAK,IAAI,GAAI;AAAA,MACvE,UAAU;AAAA,IACZ,CAAC;AACD,WAAO;AAAA,EACT;AAQF;AAkBO,MAAe,aAA2D;AAAA,EAC/E,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAoE;AAAA,EAChF,SAAS,IAAI,mBAAgC;AAAA,EAC7C,QAAQ,IAAI,mBAAgC;AAAA,EAC5C;AAAA,EACA;AAAA,EAEA,SAAS;AAAA,EACnB;AAAA,EACQ;AAAA,EACA,SAAS,IAAI;AAAA,EACb;AAAA,EAER,YACE,KACA,YACA,oBAAuC,6BACvC;AACA,SAAK,OAAO;AACZ,SAAK,eAAe;AACpB,SAAK,sBAAsB,IAAI,uBAAmC;AAClE,SAAK,mBAAmB;AACxB,SAAK,eAAe;AACpB,SAAK,UAAU;AAMf,cAAU,MAAM,KAAK,SAAS,EAAE,KAAK,MAAM,KAAK,MAAM,MAAM,CAAC,CAAC;AAAA,EAChE;AAAA,EAEA,MAAc,WAAW;AACvB,aAAS,IAAI,GAAG,IAAI,KAAK,aAAa,WAAW,GAAG,KAAK;AACvD,UAAI;AACF,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB,SAAS,OAAO;AACd,YAAI,iBAAiB,UAAU;AAC7B,gBAAM,gBAAgB,KAAK,aAAa,kBAAkB,CAAC;AAE3D,cAAI,KAAK,aAAa,aAAa,KAAK,CAAC,MAAM,WAAW;AACxD,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM;AAAA,UACR,WAAW,MAAM,KAAK,aAAa,UAAU;AAC3C,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM,IAAI,mBAAmB;AAAA,cAC3B,SAAS,oCAAoC,KAAK,aAAa,WAAW,CAAC;AAAA,cAC3E,SAAS,EAAE,WAAW,MAAM;AAAA,YAC9B,CAAC;AAAA,UACH,OAAO;AACL,iBAAK,UAAU,EAAE,OAAO,aAAa,KAAK,CAAC;AAC3C,iBAAK,OAAO;AAAA,cACV,EAAE,KAAK,KAAK,KAAK,OAAO,SAAS,IAAI,GAAG,MAAM;AAAA,cAC9C,2CAA2C,aAAa;AAAA,YAC1D;AAAA,UACF;AAEA,cAAI,gBAAgB,GAAG;AACrB,kBAAM,MAAM,aAAa;AAAA,UAC3B;AAAA,QACF,OAAO;AACL,eAAK,UAAU,EAAE,OAAO,QAAQ,KAAK,GAAG,aAAa,MAAM,CAAC;AAC5D,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,UAAU,EAAE,OAAO,YAAY,GAA2C;AAChF,SAAK,KAAK,KAAK,SAAS;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,KAAK,IAAI;AAAA,MACpB,OAAO,KAAK,KAAK;AAAA,MACjB;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAgB,YAAY;AAE1B,UAAM,cAAc,KAAK,oBAAoB;AAC7C,UAAM,SAAS,YAAY,UAAU;AAErC,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,aAAK,UAAU,KAAK;AAAA,MACtB;AAAA,IACF,SAAS,OAAO;AACd,WAAK,OAAO,MAAM,gCAAgC,KAAK;AAAA,IACzD,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,MAAM,SAAS,0BAAmC;AACtD,YAAM,UAAsB;AAAA,QAC1B,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,WAAW,MAAM;AAAA,QACjB,YAAY;AAAA,QACZ,OAAO,KAAK,KAAK;AAAA,QACjB,iBAAiB,KAAK,MAAM,MAAM,iBAAkB,gBAAgB,GAAI;AAAA,QACxE,UAAU;AAAA,MACZ;AACA,WAAK,KAAK,KAAK,qBAAqB,OAAO;AAAA,IAC7C;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAIA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,oBAAoB,MAAM,eAAe,KAAK,kBAAkB;AACvE,UAAI,CAAC,KAAK,WAAW;AACnB,aAAK,YAAY,IAAI,eAAe,MAAM,YAAY,KAAK,gBAAgB;AAAA,MAC7E;AAAA,IACF;AAEA,QAAI,KAAK,WAAW;AAClB,YAAM,SAAS,KAAK,UAAU,KAAK,KAAK;AACxC,iBAAWC,UAAS,QAAQ;AAC1B,aAAK,MAAM,IAAIA,MAAK;AAAA,MACtB;AAAA,IACF,OAAO;AACL,WAAK,MAAM,IAAI,KAAK;AAAA,IACtB;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,aAAa,cAAc;AAAA,EAC5C;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA6C;AAC3C,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAkB;AACrC,WAAO;AAAA,EACT;AACF;","names":["SpeechEventType","frame"]}
|
package/dist/vad.cjs
CHANGED
|
@@ -130,6 +130,22 @@ class VADStream {
|
|
|
130
130
|
}
|
|
131
131
|
}
|
|
132
132
|
}
|
|
133
|
+
/**
|
|
134
|
+
* Safely send a VAD event to the output stream, handling writer release errors during shutdown.
|
|
135
|
+
* @returns true if the event was sent, false if the stream is closing
|
|
136
|
+
* @throws Error if an unexpected error occurs
|
|
137
|
+
*/
|
|
138
|
+
sendVADEvent(event) {
|
|
139
|
+
if (this.closed) {
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
try {
|
|
143
|
+
this.outputWriter.write(event);
|
|
144
|
+
return true;
|
|
145
|
+
} catch (e) {
|
|
146
|
+
throw e;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
133
149
|
updateInputStream(audioStream) {
|
|
134
150
|
this.deferredInputStream.setSource(audioStream);
|
|
135
151
|
}
|
package/dist/vad.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type {\n ReadableStream,\n ReadableStreamDefaultReader,\n WritableStreamDefaultWriter,\n} from 'node:stream/web';\nimport { log } from './log.js';\nimport type { VADMetrics } from './metrics/base.js';\nimport { DeferredReadableStream } from './stream/deferred_stream.js';\nimport { IdentityTransform } from './stream/identity_transform.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. */\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n ['metrics_collected']: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new IdentityTransform<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected output = new IdentityTransform<VADEvent>();\n protected inputWriter: WritableStreamDefaultWriter<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected inputReader: ReadableStreamDefaultReader<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected outputWriter: WritableStreamDefaultWriter<VADEvent>;\n protected outputReader: ReadableStreamDefaultReader<VADEvent>;\n protected closed = false;\n protected inputClosed = false;\n\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n private logger = log();\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n\n private metricsStream: ReadableStream<VADEvent>;\n constructor(vad: VAD) {\n this.#vad = vad;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n\n this.inputWriter = this.input.writable.getWriter();\n this.inputReader = this.input.readable.getReader();\n this.outputWriter = this.output.writable.getWriter();\n\n const [outputStream, metricsStream] = this.output.readable.tee();\n this.metricsStream = metricsStream;\n this.outputReader = outputStream.getReader();\n\n this.pumpDeferredStream();\n this.monitorMetrics();\n }\n\n /**\n * Reads from the deferred input stream and forwards chunks to the input writer.\n *\n * Note: we can't just do this.deferredInputStream.stream.pipeTo(this.input.writable)\n * because the inputWriter locks the this.input.writable stream. All writes must go through\n * the inputWriter.\n */\n private async pumpDeferredStream() {\n const reader = this.deferredInputStream.stream.getReader();\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n await this.inputWriter.write(value);\n }\n } catch (e) {\n this.logger.error(`Error pumping deferred stream: ${e}`);\n throw e;\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotalMs = 0;\n let inferenceCount = 0;\n const metricsReader = this.metricsStream.getReader();\n while (true) {\n const { done, value } = await metricsReader.read();\n if (done) {\n break;\n }\n switch (value.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit('metrics_collected', {\n type: 'vad_metrics',\n timestamp: Date.now(),\n idleTimeMs: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotalMs,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotalMs = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n inferenceDurationTotalMs += Math.round(value.inferenceDuration);\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n }\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** @deprecated Use `updateInputStream` instead */\n pushFrame(frame: AudioFrame) {\n // TODO(AJS-395): remove this method\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(frame);\n }\n\n flush() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputClosed = true;\n this.input.writable.close();\n }\n\n async next(): Promise<IteratorResult<VADEvent>> {\n return this.outputReader.read().then(({ done, value }) => {\n if (done) {\n return { done: true, value: undefined };\n }\n return { done: false, value };\n });\n }\n\n close() {\n this.outputWriter.releaseLock();\n this.outputReader.cancel();\n this.output.writable.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAM7B,iBAAoB;AAEpB,6BAAuC;AACvC,gCAAkC;AAE3B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,4CAAgE;AAAA,EAC5E,SAAS,IAAI,4CAA4B;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,SAAS;AAAA,EACT,cAAc;AAAA,EAExB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EACpB,aAAS,gBAAI;AAAA,EACb;AAAA,EAEA;AAAA,EACR,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,sBAAsB,IAAI,8CAAmC;AAElE,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,eAAe,KAAK,OAAO,SAAS,UAAU;AAEnD,UAAM,CAAC,cAAc,aAAa,IAAI,KAAK,OAAO,SAAS,IAAI;AAC/D,SAAK,gBAAgB;AACrB,SAAK,eAAe,aAAa,UAAU;AAE3C,SAAK,mBAAmB;AACxB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAc,qBAAqB;AACjC,UAAM,SAAS,KAAK,oBAAoB,OAAO,UAAU;AACzD,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,cAAM,KAAK,YAAY,MAAM,KAAK;AAAA,MACpC;AAAA,IACF,SAAS,GAAG;AACV,WAAK,OAAO,MAAM,kCAAkC,CAAC,EAAE;AACvD,YAAM;AAAA,IACR,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,2BAA2B;AAC/B,QAAI,iBAAiB;AACrB,UAAM,gBAAgB,KAAK,cAAc,UAAU;AACnD,WAAO,MAAM;AACX,YAAM,EAAE,MAAM,MAAM,IAAI,MAAM,cAAc,KAAK;AACjD,UAAI,MAAM;AACR;AAAA,MACF;AACA,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,qBAAqB;AAAA,cAClC,MAAM;AAAA,cACN,WAAW,KAAK,IAAI;AAAA,cACpB,YAAY,KAAK;AAAA,gBACf,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,uCAA2B;AAAA,UAC7B;AACA;AAAA,QACF,KAAK;AACH,sCAA4B,KAAK,MAAM,MAAM,iBAAiB;AAC9D,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,QACF,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAAA,EAEA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAE3B,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,KAAK;AAAA,EAC9B;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,UAAU,cAAc;AAAA,EACjD;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,cAAc;AACnB,SAAK,MAAM,SAAS,MAAM;AAAA,EAC5B;AAAA,EAEA,MAAM,OAA0C;AAC9C,WAAO,KAAK,aAAa,KAAK,EAAE,KAAK,CAAC,EAAE,MAAM,MAAM,MAAM;AACxD,UAAI,MAAM;AACR,eAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,MACxC;AACA,aAAO,EAAE,MAAM,OAAO,MAAM;AAAA,IAC9B,CAAC;AAAA,EACH;AAAA,EAEA,QAAQ;AACN,SAAK,aAAa,YAAY;AAC9B,SAAK,aAAa,OAAO;AACzB,SAAK,OAAO,SAAS,MAAM;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type {\n ReadableStream,\n ReadableStreamDefaultReader,\n WritableStreamDefaultWriter,\n} from 'node:stream/web';\nimport { log } from './log.js';\nimport type { VADMetrics } from './metrics/base.js';\nimport { DeferredReadableStream } from './stream/deferred_stream.js';\nimport { IdentityTransform } from './stream/identity_transform.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. */\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n ['metrics_collected']: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new IdentityTransform<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected output = new IdentityTransform<VADEvent>();\n protected inputWriter: WritableStreamDefaultWriter<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected inputReader: ReadableStreamDefaultReader<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected outputWriter: WritableStreamDefaultWriter<VADEvent>;\n protected outputReader: ReadableStreamDefaultReader<VADEvent>;\n protected closed = false;\n protected inputClosed = false;\n\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n private logger = log();\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n\n private metricsStream: ReadableStream<VADEvent>;\n constructor(vad: VAD) {\n this.#vad = vad;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n\n this.inputWriter = this.input.writable.getWriter();\n this.inputReader = this.input.readable.getReader();\n this.outputWriter = this.output.writable.getWriter();\n\n const [outputStream, metricsStream] = this.output.readable.tee();\n this.metricsStream = metricsStream;\n this.outputReader = outputStream.getReader();\n\n this.pumpDeferredStream();\n this.monitorMetrics();\n }\n\n /**\n * Reads from the deferred input stream and forwards chunks to the input writer.\n *\n * Note: we can't just do this.deferredInputStream.stream.pipeTo(this.input.writable)\n * because the inputWriter locks the this.input.writable stream. All writes must go through\n * the inputWriter.\n */\n private async pumpDeferredStream() {\n const reader = this.deferredInputStream.stream.getReader();\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n await this.inputWriter.write(value);\n }\n } catch (e) {\n this.logger.error(`Error pumping deferred stream: ${e}`);\n throw e;\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotalMs = 0;\n let inferenceCount = 0;\n const metricsReader = this.metricsStream.getReader();\n while (true) {\n const { done, value } = await metricsReader.read();\n if (done) {\n break;\n }\n switch (value.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit('metrics_collected', {\n type: 'vad_metrics',\n timestamp: Date.now(),\n idleTimeMs: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotalMs,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotalMs = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n inferenceDurationTotalMs += Math.round(value.inferenceDuration);\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n }\n\n /**\n * Safely send a VAD event to the output stream, handling writer release errors during shutdown.\n * @returns true if the event was sent, false if the stream is closing\n * @throws Error if an unexpected error occurs\n */\n protected sendVADEvent(event: VADEvent): boolean {\n if (this.closed) {\n return false;\n }\n\n try {\n this.outputWriter.write(event);\n return true;\n } catch (e) {\n throw e;\n }\n }\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** @deprecated Use `updateInputStream` instead */\n pushFrame(frame: AudioFrame) {\n // TODO(AJS-395): remove this method\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(frame);\n }\n\n flush() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputClosed = true;\n this.input.writable.close();\n }\n\n async next(): Promise<IteratorResult<VADEvent>> {\n return this.outputReader.read().then(({ done, value }) => {\n if (done) {\n return { done: true, value: undefined };\n }\n return { done: false, value };\n });\n }\n\n close() {\n this.outputWriter.releaseLock();\n this.outputReader.cancel();\n this.output.writable.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAM7B,iBAAoB;AAEpB,6BAAuC;AACvC,gCAAkC;AAE3B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,4CAAgE;AAAA,EAC5E,SAAS,IAAI,4CAA4B;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,SAAS;AAAA,EACT,cAAc;AAAA,EAExB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EACpB,aAAS,gBAAI;AAAA,EACb;AAAA,EAEA;AAAA,EACR,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,sBAAsB,IAAI,8CAAmC;AAElE,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,eAAe,KAAK,OAAO,SAAS,UAAU;AAEnD,UAAM,CAAC,cAAc,aAAa,IAAI,KAAK,OAAO,SAAS,IAAI;AAC/D,SAAK,gBAAgB;AACrB,SAAK,eAAe,aAAa,UAAU;AAE3C,SAAK,mBAAmB;AACxB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAc,qBAAqB;AACjC,UAAM,SAAS,KAAK,oBAAoB,OAAO,UAAU;AACzD,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,cAAM,KAAK,YAAY,MAAM,KAAK;AAAA,MACpC;AAAA,IACF,SAAS,GAAG;AACV,WAAK,OAAO,MAAM,kCAAkC,CAAC,EAAE;AACvD,YAAM;AAAA,IACR,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,2BAA2B;AAC/B,QAAI,iBAAiB;AACrB,UAAM,gBAAgB,KAAK,cAAc,UAAU;AACnD,WAAO,MAAM;AACX,YAAM,EAAE,MAAM,MAAM,IAAI,MAAM,cAAc,KAAK;AACjD,UAAI,MAAM;AACR;AAAA,MACF;AACA,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,qBAAqB;AAAA,cAClC,MAAM;AAAA,cACN,WAAW,KAAK,IAAI;AAAA,cACpB,YAAY,KAAK;AAAA,gBACf,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,uCAA2B;AAAA,UAC7B;AACA;AAAA,QACF,KAAK;AACH,sCAA4B,KAAK,MAAM,MAAM,iBAAiB;AAC9D,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,QACF,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOU,aAAa,OAA0B;AAC/C,QAAI,KAAK,QAAQ;AACf,aAAO;AAAA,IACT;AAEA,QAAI;AACF,WAAK,aAAa,MAAM,KAAK;AAC7B,aAAO;AAAA,IACT,SAAS,GAAG;AACV,YAAM;AAAA,IACR;AAAA,EACF;AAAA,EAEA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAE3B,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,KAAK;AAAA,EAC9B;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,UAAU,cAAc;AAAA,EACjD;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,cAAc;AACnB,SAAK,MAAM,SAAS,MAAM;AAAA,EAC5B;AAAA,EAEA,MAAM,OAA0C;AAC9C,WAAO,KAAK,aAAa,KAAK,EAAE,KAAK,CAAC,EAAE,MAAM,MAAM,MAAM;AACxD,UAAI,MAAM;AACR,eAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,MACxC;AACA,aAAO,EAAE,MAAM,OAAO,MAAM;AAAA,IAC9B,CAAC;AAAA,EACH;AAAA,EAEA,QAAQ;AACN,SAAK,aAAa,YAAY;AAC9B,SAAK,aAAa,OAAO;AACzB,SAAK,OAAO,SAAS,MAAM;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
|
package/dist/vad.d.cts
CHANGED
|
@@ -84,6 +84,12 @@ export declare abstract class VADStream implements AsyncIterableIterator<VADEven
|
|
|
84
84
|
*/
|
|
85
85
|
private pumpDeferredStream;
|
|
86
86
|
protected monitorMetrics(): Promise<void>;
|
|
87
|
+
/**
|
|
88
|
+
* Safely send a VAD event to the output stream, handling writer release errors during shutdown.
|
|
89
|
+
* @returns true if the event was sent, false if the stream is closing
|
|
90
|
+
* @throws Error if an unexpected error occurs
|
|
91
|
+
*/
|
|
92
|
+
protected sendVADEvent(event: VADEvent): boolean;
|
|
87
93
|
updateInputStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
88
94
|
detachInputStream(): void;
|
|
89
95
|
/** @deprecated Use `updateInputStream` instead */
|
package/dist/vad.d.ts
CHANGED
|
@@ -84,6 +84,12 @@ export declare abstract class VADStream implements AsyncIterableIterator<VADEven
|
|
|
84
84
|
*/
|
|
85
85
|
private pumpDeferredStream;
|
|
86
86
|
protected monitorMetrics(): Promise<void>;
|
|
87
|
+
/**
|
|
88
|
+
* Safely send a VAD event to the output stream, handling writer release errors during shutdown.
|
|
89
|
+
* @returns true if the event was sent, false if the stream is closing
|
|
90
|
+
* @throws Error if an unexpected error occurs
|
|
91
|
+
*/
|
|
92
|
+
protected sendVADEvent(event: VADEvent): boolean;
|
|
87
93
|
updateInputStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
88
94
|
detachInputStream(): void;
|
|
89
95
|
/** @deprecated Use `updateInputStream` instead */
|
package/dist/vad.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EACV,cAAc,EACd,2BAA2B,EAC3B,2BAA2B,EAC5B,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AAEnE,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;IACb,iBAAiB,IAAA;CAClB;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IACvB,uCAAuC;IACvC,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;IAClB,wCAAwC;IACxC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uCAAuC;IACvC,oBAAoB,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,mBAAmB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CACtD,CAAC;kCAE2D,aAAa,YAAY,CAAC;AAAvF,8BAAsB,GAAI,SAAQ,QAAsD;;IAEtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAEX,YAAY,EAAE,eAAe;IAKzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,kEAAyE;IACxF,SAAS,CAAC,MAAM,8BAAqC;IACrD,SAAS,CAAC,WAAW,EAAE,2BAA2B,CAAC,UAAU,GAAG,OAAO,SAAS,CAAC,cAAc,CAAC,CAAC;IACjG,SAAS,CAAC,WAAW,EAAE,2BAA2B,CAAC,UAAU,GAAG,OAAO,SAAS,CAAC,cAAc,CAAC,CAAC;IACjG,SAAS,CAAC,YAAY,EAAE,2BAA2B,CAAC,QAAQ,CAAC,CAAC;IAC9D,SAAS,CAAC,YAAY,EAAE,2BAA2B,CAAC,QAAQ,CAAC,CAAC;IAC9D,SAAS,CAAC,MAAM,UAAS;IACzB,SAAS,CAAC,WAAW,UAAS;IAI9B,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,mBAAmB,CAAqC;IAEhE,OAAO,CAAC,aAAa,CAA2B;gBACpC,GAAG,EAAE,GAAG;IAgBpB;;;;;;OAMG;YACW,kBAAkB;cAgBhB,cAAc;IAuC9B,iBAAiB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC;IAIzD,iBAAiB;IAIjB,kDAAkD;IAClD,SAAS,CAAC,KAAK,EAAE,UAAU;IAW3B,KAAK;IAUL,QAAQ;IAWF,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAS/C,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
|
|
1
|
+
{"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":";AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EACV,cAAc,EACd,2BAA2B,EAC3B,2BAA2B,EAC5B,MAAM,iBAAiB,CAAC;AAEzB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AAEnE,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;IACb,iBAAiB,IAAA;CAClB;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IACvB,uCAAuC;IACvC,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;IAClB,wCAAwC;IACxC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uCAAuC;IACvC,oBAAoB,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,mBAAmB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CACtD,CAAC;kCAE2D,aAAa,YAAY,CAAC;AAAvF,8BAAsB,GAAI,SAAQ,QAAsD;;IAEtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAEX,YAAY,EAAE,eAAe;IAKzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,kEAAyE;IACxF,SAAS,CAAC,MAAM,8BAAqC;IACrD,SAAS,CAAC,WAAW,EAAE,2BAA2B,CAAC,UAAU,GAAG,OAAO,SAAS,CAAC,cAAc,CAAC,CAAC;IACjG,SAAS,CAAC,WAAW,EAAE,2BAA2B,CAAC,UAAU,GAAG,OAAO,SAAS,CAAC,cAAc,CAAC,CAAC;IACjG,SAAS,CAAC,YAAY,EAAE,2BAA2B,CAAC,QAAQ,CAAC,CAAC;IAC9D,SAAS,CAAC,YAAY,EAAE,2BAA2B,CAAC,QAAQ,CAAC,CAAC;IAC9D,SAAS,CAAC,MAAM,UAAS;IACzB,SAAS,CAAC,WAAW,UAAS;IAI9B,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,mBAAmB,CAAqC;IAEhE,OAAO,CAAC,aAAa,CAA2B;gBACpC,GAAG,EAAE,GAAG;IAgBpB;;;;;;OAMG;YACW,kBAAkB;cAgBhB,cAAc;IAuC9B;;;;OAIG;IACH,SAAS,CAAC,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO;IAahD,iBAAiB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC;IAIzD,iBAAiB;IAIjB,kDAAkD;IAClD,SAAS,CAAC,KAAK,EAAE,UAAU;IAW3B,KAAK;IAUL,QAAQ;IAWF,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAS/C,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
|
package/dist/vad.js
CHANGED
|
@@ -105,6 +105,22 @@ class VADStream {
|
|
|
105
105
|
}
|
|
106
106
|
}
|
|
107
107
|
}
|
|
108
|
+
/**
|
|
109
|
+
* Safely send a VAD event to the output stream, handling writer release errors during shutdown.
|
|
110
|
+
* @returns true if the event was sent, false if the stream is closing
|
|
111
|
+
* @throws Error if an unexpected error occurs
|
|
112
|
+
*/
|
|
113
|
+
sendVADEvent(event) {
|
|
114
|
+
if (this.closed) {
|
|
115
|
+
return false;
|
|
116
|
+
}
|
|
117
|
+
try {
|
|
118
|
+
this.outputWriter.write(event);
|
|
119
|
+
return true;
|
|
120
|
+
} catch (e) {
|
|
121
|
+
throw e;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
108
124
|
updateInputStream(audioStream) {
|
|
109
125
|
this.deferredInputStream.setSource(audioStream);
|
|
110
126
|
}
|
package/dist/vad.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type {\n ReadableStream,\n ReadableStreamDefaultReader,\n WritableStreamDefaultWriter,\n} from 'node:stream/web';\nimport { log } from './log.js';\nimport type { VADMetrics } from './metrics/base.js';\nimport { DeferredReadableStream } from './stream/deferred_stream.js';\nimport { IdentityTransform } from './stream/identity_transform.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. */\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n ['metrics_collected']: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new IdentityTransform<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected output = new IdentityTransform<VADEvent>();\n protected inputWriter: WritableStreamDefaultWriter<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected inputReader: ReadableStreamDefaultReader<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected outputWriter: WritableStreamDefaultWriter<VADEvent>;\n protected outputReader: ReadableStreamDefaultReader<VADEvent>;\n protected closed = false;\n protected inputClosed = false;\n\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n private logger = log();\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n\n private metricsStream: ReadableStream<VADEvent>;\n constructor(vad: VAD) {\n this.#vad = vad;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n\n this.inputWriter = this.input.writable.getWriter();\n this.inputReader = this.input.readable.getReader();\n this.outputWriter = this.output.writable.getWriter();\n\n const [outputStream, metricsStream] = this.output.readable.tee();\n this.metricsStream = metricsStream;\n this.outputReader = outputStream.getReader();\n\n this.pumpDeferredStream();\n this.monitorMetrics();\n }\n\n /**\n * Reads from the deferred input stream and forwards chunks to the input writer.\n *\n * Note: we can't just do this.deferredInputStream.stream.pipeTo(this.input.writable)\n * because the inputWriter locks the this.input.writable stream. All writes must go through\n * the inputWriter.\n */\n private async pumpDeferredStream() {\n const reader = this.deferredInputStream.stream.getReader();\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n await this.inputWriter.write(value);\n }\n } catch (e) {\n this.logger.error(`Error pumping deferred stream: ${e}`);\n throw e;\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotalMs = 0;\n let inferenceCount = 0;\n const metricsReader = this.metricsStream.getReader();\n while (true) {\n const { done, value } = await metricsReader.read();\n if (done) {\n break;\n }\n switch (value.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit('metrics_collected', {\n type: 'vad_metrics',\n timestamp: Date.now(),\n idleTimeMs: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotalMs,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotalMs = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n inferenceDurationTotalMs += Math.round(value.inferenceDuration);\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n }\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** @deprecated Use `updateInputStream` instead */\n pushFrame(frame: AudioFrame) {\n // TODO(AJS-395): remove this method\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(frame);\n }\n\n flush() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputClosed = true;\n this.input.writable.close();\n }\n\n async next(): Promise<IteratorResult<VADEvent>> {\n return this.outputReader.read().then(({ done, value }) => {\n if (done) {\n return { done: true, value: undefined };\n }\n return { done: false, value };\n });\n }\n\n close() {\n this.outputWriter.releaseLock();\n this.outputReader.cancel();\n this.output.writable.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAM7B,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAS,yBAAyB;AAE3B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,kBAAgE;AAAA,EAC5E,SAAS,IAAI,kBAA4B;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,SAAS;AAAA,EACT,cAAc;AAAA,EAExB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EACpB,SAAS,IAAI;AAAA,EACb;AAAA,EAEA;AAAA,EACR,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,sBAAsB,IAAI,uBAAmC;AAElE,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,eAAe,KAAK,OAAO,SAAS,UAAU;AAEnD,UAAM,CAAC,cAAc,aAAa,IAAI,KAAK,OAAO,SAAS,IAAI;AAC/D,SAAK,gBAAgB;AACrB,SAAK,eAAe,aAAa,UAAU;AAE3C,SAAK,mBAAmB;AACxB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAc,qBAAqB;AACjC,UAAM,SAAS,KAAK,oBAAoB,OAAO,UAAU;AACzD,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,cAAM,KAAK,YAAY,MAAM,KAAK;AAAA,MACpC;AAAA,IACF,SAAS,GAAG;AACV,WAAK,OAAO,MAAM,kCAAkC,CAAC,EAAE;AACvD,YAAM;AAAA,IACR,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,2BAA2B;AAC/B,QAAI,iBAAiB;AACrB,UAAM,gBAAgB,KAAK,cAAc,UAAU;AACnD,WAAO,MAAM;AACX,YAAM,EAAE,MAAM,MAAM,IAAI,MAAM,cAAc,KAAK;AACjD,UAAI,MAAM;AACR;AAAA,MACF;AACA,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,qBAAqB;AAAA,cAClC,MAAM;AAAA,cACN,WAAW,KAAK,IAAI;AAAA,cACpB,YAAY,KAAK;AAAA,gBACf,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,uCAA2B;AAAA,UAC7B;AACA;AAAA,QACF,KAAK;AACH,sCAA4B,KAAK,MAAM,MAAM,iBAAiB;AAC9D,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,QACF,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAAA,EAEA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAE3B,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,KAAK;AAAA,EAC9B;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,UAAU,cAAc;AAAA,EACjD;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,cAAc;AACnB,SAAK,MAAM,SAAS,MAAM;AAAA,EAC5B;AAAA,EAEA,MAAM,OAA0C;AAC9C,WAAO,KAAK,aAAa,KAAK,EAAE,KAAK,CAAC,EAAE,MAAM,MAAM,MAAM;AACxD,UAAI,MAAM;AACR,eAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,MACxC;AACA,aAAO,EAAE,MAAM,OAAO,MAAM;AAAA,IAC9B,CAAC;AAAA,EACH;AAAA,EAEA,QAAQ;AACN,SAAK,aAAa,YAAY;AAC9B,SAAK,aAAa,OAAO;AACzB,SAAK,OAAO,SAAS,MAAM;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
|
|
1
|
+
{"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type {\n ReadableStream,\n ReadableStreamDefaultReader,\n WritableStreamDefaultWriter,\n} from 'node:stream/web';\nimport { log } from './log.js';\nimport type { VADMetrics } from './metrics/base.js';\nimport { DeferredReadableStream } from './stream/deferred_stream.js';\nimport { IdentityTransform } from './stream/identity_transform.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. */\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n ['metrics_collected']: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new IdentityTransform<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected output = new IdentityTransform<VADEvent>();\n protected inputWriter: WritableStreamDefaultWriter<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected inputReader: ReadableStreamDefaultReader<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;\n protected outputWriter: WritableStreamDefaultWriter<VADEvent>;\n protected outputReader: ReadableStreamDefaultReader<VADEvent>;\n protected closed = false;\n protected inputClosed = false;\n\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n private logger = log();\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n\n private metricsStream: ReadableStream<VADEvent>;\n constructor(vad: VAD) {\n this.#vad = vad;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n\n this.inputWriter = this.input.writable.getWriter();\n this.inputReader = this.input.readable.getReader();\n this.outputWriter = this.output.writable.getWriter();\n\n const [outputStream, metricsStream] = this.output.readable.tee();\n this.metricsStream = metricsStream;\n this.outputReader = outputStream.getReader();\n\n this.pumpDeferredStream();\n this.monitorMetrics();\n }\n\n /**\n * Reads from the deferred input stream and forwards chunks to the input writer.\n *\n * Note: we can't just do this.deferredInputStream.stream.pipeTo(this.input.writable)\n * because the inputWriter locks the this.input.writable stream. All writes must go through\n * the inputWriter.\n */\n private async pumpDeferredStream() {\n const reader = this.deferredInputStream.stream.getReader();\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n await this.inputWriter.write(value);\n }\n } catch (e) {\n this.logger.error(`Error pumping deferred stream: ${e}`);\n throw e;\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotalMs = 0;\n let inferenceCount = 0;\n const metricsReader = this.metricsStream.getReader();\n while (true) {\n const { done, value } = await metricsReader.read();\n if (done) {\n break;\n }\n switch (value.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit('metrics_collected', {\n type: 'vad_metrics',\n timestamp: Date.now(),\n idleTimeMs: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotalMs,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotalMs = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n inferenceDurationTotalMs += Math.round(value.inferenceDuration);\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n }\n\n /**\n * Safely send a VAD event to the output stream, handling writer release errors during shutdown.\n * @returns true if the event was sent, false if the stream is closing\n * @throws Error if an unexpected error occurs\n */\n protected sendVADEvent(event: VADEvent): boolean {\n if (this.closed) {\n return false;\n }\n\n try {\n this.outputWriter.write(event);\n return true;\n } catch (e) {\n throw e;\n }\n }\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** @deprecated Use `updateInputStream` instead */\n pushFrame(frame: AudioFrame) {\n // TODO(AJS-395): remove this method\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(frame);\n }\n\n flush() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputWriter.write(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.inputClosed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.inputClosed = true;\n this.input.writable.close();\n }\n\n async next(): Promise<IteratorResult<VADEvent>> {\n return this.outputReader.read().then(({ done, value }) => {\n if (done) {\n return { done: true, value: undefined };\n }\n return { done: false, value };\n });\n }\n\n close() {\n this.outputWriter.releaseLock();\n this.outputReader.cancel();\n this.output.writable.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAM7B,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAS,yBAAyB;AAE3B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,kBAAgE;AAAA,EAC5E,SAAS,IAAI,kBAA4B;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,SAAS;AAAA,EACT,cAAc;AAAA,EAExB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EACpB,SAAS,IAAI;AAAA,EACb;AAAA,EAEA;AAAA,EACR,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,sBAAsB,IAAI,uBAAmC;AAElE,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,cAAc,KAAK,MAAM,SAAS,UAAU;AACjD,SAAK,eAAe,KAAK,OAAO,SAAS,UAAU;AAEnD,UAAM,CAAC,cAAc,aAAa,IAAI,KAAK,OAAO,SAAS,IAAI;AAC/D,SAAK,gBAAgB;AACrB,SAAK,eAAe,aAAa,UAAU;AAE3C,SAAK,mBAAmB;AACxB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAc,qBAAqB;AACjC,UAAM,SAAS,KAAK,oBAAoB,OAAO,UAAU;AACzD,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,cAAM,KAAK,YAAY,MAAM,KAAK;AAAA,MACpC;AAAA,IACF,SAAS,GAAG;AACV,WAAK,OAAO,MAAM,kCAAkC,CAAC,EAAE;AACvD,YAAM;AAAA,IACR,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,2BAA2B;AAC/B,QAAI,iBAAiB;AACrB,UAAM,gBAAgB,KAAK,cAAc,UAAU;AACnD,WAAO,MAAM;AACX,YAAM,EAAE,MAAM,MAAM,IAAI,MAAM,cAAc,KAAK;AACjD,UAAI,MAAM;AACR;AAAA,MACF;AACA,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,qBAAqB;AAAA,cAClC,MAAM;AAAA,cACN,WAAW,KAAK,IAAI;AAAA,cACpB,YAAY,KAAK;AAAA,gBACf,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,uCAA2B;AAAA,UAC7B;AACA;AAAA,QACF,KAAK;AACH,sCAA4B,KAAK,MAAM,MAAM,iBAAiB;AAC9D,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,QACF,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOU,aAAa,OAA0B;AAC/C,QAAI,KAAK,QAAQ;AACf,aAAO;AAAA,IACT;AAEA,QAAI;AACF,WAAK,aAAa,MAAM,KAAK;AAC7B,aAAO;AAAA,IACT,SAAS,GAAG;AACV,YAAM;AAAA,IACR;AAAA,EACF;AAAA,EAEA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAE3B,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,KAAK;AAAA,EAC9B;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,YAAY,MAAM,UAAU,cAAc;AAAA,EACjD;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,cAAc;AACnB,SAAK,MAAM,SAAS,MAAM;AAAA,EAC5B;AAAA,EAEA,MAAM,OAA0C;AAC9C,WAAO,KAAK,aAAa,KAAK,EAAE,KAAK,CAAC,EAAE,MAAM,MAAM,MAAM;AACxD,UAAI,MAAM;AACR,eAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,MACxC;AACA,aAAO,EAAE,MAAM,OAAO,MAAM;AAAA,IAC9B,CAAC;AAAA,EACH;AAAA,EAEA,QAAQ;AACN,SAAK,aAAa,YAAY;AAC9B,SAAK,aAAa,OAAO;AACzB,SAAK,OAAO,SAAS,MAAM;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
|
|
@@ -27,6 +27,7 @@ var import_node_async_hooks = require("node:async_hooks");
|
|
|
27
27
|
var import_web = require("node:stream/web");
|
|
28
28
|
var import_chat_context = require("../llm/chat_context.cjs");
|
|
29
29
|
var import_llm = require("../llm/index.cjs");
|
|
30
|
+
var import_tool_context = require("../llm/tool_context.cjs");
|
|
30
31
|
var import_log = require("../log.cjs");
|
|
31
32
|
var import_deferred_stream = require("../stream/deferred_stream.cjs");
|
|
32
33
|
var import_stt = require("../stt/stt.cjs");
|
|
@@ -58,6 +59,7 @@ class AgentActivity {
|
|
|
58
59
|
audioStream = new import_deferred_stream.DeferredReadableStream();
|
|
59
60
|
// default to null as None, which maps to the default provider tool choice value
|
|
60
61
|
toolChoice = null;
|
|
62
|
+
_preemptiveGeneration;
|
|
61
63
|
agent;
|
|
62
64
|
agentSession;
|
|
63
65
|
/** @internal */
|
|
@@ -430,8 +432,12 @@ class AgentActivity {
|
|
|
430
432
|
onStartOfSpeech(_ev) {
|
|
431
433
|
this.agentSession._updateUserState("speaking");
|
|
432
434
|
}
|
|
433
|
-
onEndOfSpeech(
|
|
434
|
-
|
|
435
|
+
onEndOfSpeech(ev) {
|
|
436
|
+
let speechEndTime = Date.now();
|
|
437
|
+
if (ev) {
|
|
438
|
+
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
439
|
+
}
|
|
440
|
+
this.agentSession._updateUserState("listening", speechEndTime);
|
|
435
441
|
}
|
|
436
442
|
onVADInferenceDone(ev) {
|
|
437
443
|
var _a, _b;
|
|
@@ -485,6 +491,44 @@ class AgentActivity {
|
|
|
485
491
|
})
|
|
486
492
|
);
|
|
487
493
|
}
|
|
494
|
+
onPreemptiveGeneration(info) {
|
|
495
|
+
if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
496
|
+
return;
|
|
497
|
+
}
|
|
498
|
+
this.cancelPreemptiveGeneration();
|
|
499
|
+
this.logger.info(
|
|
500
|
+
{
|
|
501
|
+
newTranscript: info.newTranscript,
|
|
502
|
+
transcriptConfidence: info.transcriptConfidence
|
|
503
|
+
},
|
|
504
|
+
"starting preemptive generation"
|
|
505
|
+
);
|
|
506
|
+
const userMessage = import_chat_context.ChatMessage.create({
|
|
507
|
+
role: "user",
|
|
508
|
+
content: info.newTranscript
|
|
509
|
+
});
|
|
510
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
511
|
+
const speechHandle = this.generateReply({
|
|
512
|
+
userMessage,
|
|
513
|
+
chatCtx,
|
|
514
|
+
scheduleSpeech: false
|
|
515
|
+
});
|
|
516
|
+
this._preemptiveGeneration = {
|
|
517
|
+
speechHandle,
|
|
518
|
+
userMessage,
|
|
519
|
+
info,
|
|
520
|
+
chatCtx: chatCtx.copy(),
|
|
521
|
+
tools: { ...this.tools },
|
|
522
|
+
toolChoice: this.toolChoice,
|
|
523
|
+
createdAt: Date.now()
|
|
524
|
+
};
|
|
525
|
+
}
|
|
526
|
+
cancelPreemptiveGeneration() {
|
|
527
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
528
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
529
|
+
this._preemptiveGeneration = void 0;
|
|
530
|
+
}
|
|
531
|
+
}
|
|
488
532
|
createSpeechTask(options) {
|
|
489
533
|
const { task, ownedSpeechHandle } = options;
|
|
490
534
|
this.speechTasks.add(task);
|
|
@@ -506,10 +550,12 @@ class AgentActivity {
|
|
|
506
550
|
}
|
|
507
551
|
async onEndOfTurn(info) {
|
|
508
552
|
if (this.draining) {
|
|
553
|
+
this.cancelPreemptiveGeneration();
|
|
509
554
|
this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
|
|
510
555
|
return true;
|
|
511
556
|
}
|
|
512
557
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
|
|
558
|
+
this.cancelPreemptiveGeneration();
|
|
513
559
|
this.logger.info("skipping user input, new_transcript is too short");
|
|
514
560
|
return false;
|
|
515
561
|
}
|
|
@@ -563,7 +609,8 @@ class AgentActivity {
|
|
|
563
609
|
chatCtx,
|
|
564
610
|
instructions: defaultInstructions,
|
|
565
611
|
toolChoice: defaultToolChoice,
|
|
566
|
-
allowInterruptions: defaultAllowInterruptions
|
|
612
|
+
allowInterruptions: defaultAllowInterruptions,
|
|
613
|
+
scheduleSpeech = true
|
|
567
614
|
} = options;
|
|
568
615
|
let instructions = defaultInstructions;
|
|
569
616
|
let toolChoice = defaultToolChoice;
|
|
@@ -636,7 +683,9 @@ ${instructions}` : instructions,
|
|
|
636
683
|
});
|
|
637
684
|
task.finally(() => this.onPipelineReplyDone());
|
|
638
685
|
}
|
|
639
|
-
|
|
686
|
+
if (scheduleSpeech) {
|
|
687
|
+
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
688
|
+
}
|
|
640
689
|
return handle;
|
|
641
690
|
}
|
|
642
691
|
interrupt() {
|
|
@@ -709,13 +758,36 @@ ${instructions}` : instructions,
|
|
|
709
758
|
} else if (this.llm === void 0) {
|
|
710
759
|
return;
|
|
711
760
|
}
|
|
712
|
-
|
|
761
|
+
let speechHandle;
|
|
762
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
763
|
+
const preemptive = this._preemptiveGeneration;
|
|
764
|
+
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
|
|
765
|
+
speechHandle = preemptive.speechHandle;
|
|
766
|
+
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
767
|
+
this.logger.debug(
|
|
768
|
+
{
|
|
769
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt
|
|
770
|
+
},
|
|
771
|
+
"using preemptive generation"
|
|
772
|
+
);
|
|
773
|
+
} else {
|
|
774
|
+
this.logger.warn(
|
|
775
|
+
"preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
|
|
776
|
+
);
|
|
777
|
+
preemptive.speechHandle._cancel();
|
|
778
|
+
}
|
|
779
|
+
this._preemptiveGeneration = void 0;
|
|
780
|
+
}
|
|
781
|
+
if (speechHandle === void 0) {
|
|
782
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
783
|
+
}
|
|
713
784
|
const eouMetrics = {
|
|
714
785
|
type: "eou_metrics",
|
|
715
786
|
timestamp: Date.now(),
|
|
716
787
|
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
717
788
|
transcriptionDelayMs: info.transcriptionDelay,
|
|
718
789
|
onUserTurnCompletedDelayMs: callbackDuration,
|
|
790
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
719
791
|
speechId: speechHandle.id
|
|
720
792
|
};
|
|
721
793
|
this.agentSession.emit(
|
|
@@ -823,8 +895,6 @@ ${instructions}` : instructions,
|
|
|
823
895
|
chatCtx = chatCtx.copy();
|
|
824
896
|
if (newMessage) {
|
|
825
897
|
chatCtx.insert(newMessage);
|
|
826
|
-
this.agent._chatCtx.insert(newMessage);
|
|
827
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
828
898
|
}
|
|
829
899
|
if (instructions) {
|
|
830
900
|
try {
|
|
@@ -837,7 +907,6 @@ ${instructions}` : instructions,
|
|
|
837
907
|
this.logger.error({ error: e }, "error occurred during updateInstructions");
|
|
838
908
|
}
|
|
839
909
|
}
|
|
840
|
-
this.agentSession._updateAgentState("thinking");
|
|
841
910
|
const tasks = [];
|
|
842
911
|
const [llmTask, llmGenData] = (0, import_generation.performLLMInference)(
|
|
843
912
|
// preserve `this` context in llmNode
|
|
@@ -861,6 +930,10 @@ ${instructions}` : instructions,
|
|
|
861
930
|
tasks.push(ttsTask);
|
|
862
931
|
}
|
|
863
932
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
933
|
+
if (newMessage && speechHandle.scheduled) {
|
|
934
|
+
this.agent._chatCtx.insert(newMessage);
|
|
935
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
936
|
+
}
|
|
864
937
|
if (speechHandle.interrupted) {
|
|
865
938
|
replyAbortController.abort();
|
|
866
939
|
await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -1442,6 +1515,7 @@ ${instructions}` : instructions,
|
|
|
1442
1515
|
const unlock = await this.lock.lock();
|
|
1443
1516
|
try {
|
|
1444
1517
|
if (this._draining) return;
|
|
1518
|
+
this.cancelPreemptiveGeneration();
|
|
1445
1519
|
this.createSpeechTask({
|
|
1446
1520
|
task: import_utils.Task.from(() => this.agent.onExit()),
|
|
1447
1521
|
name: "AgentActivity_onExit"
|
|
@@ -1460,6 +1534,7 @@ ${instructions}` : instructions,
|
|
|
1460
1534
|
if (!this._draining) {
|
|
1461
1535
|
this.logger.warn("task closing without draining");
|
|
1462
1536
|
}
|
|
1537
|
+
this.cancelPreemptiveGeneration();
|
|
1463
1538
|
if (this.llm instanceof import_llm.LLM) {
|
|
1464
1539
|
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1465
1540
|
}
|