@livekit/agents 1.0.15 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +12 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +13 -13
- package/dist/cli.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -1
- package/dist/inference/tts.d.ts +2 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +1 -5
- package/dist/inference/tts.js.map +1 -1
- package/dist/llm/chat_context.cjs +78 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +16 -0
- package/dist/llm/chat_context.d.ts +16 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +78 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +531 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +531 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/tool_context.cjs +40 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +2 -0
- package/dist/llm/tool_context.d.ts +2 -0
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +38 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +7 -0
- package/dist/metrics/base.d.ts +7 -0
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/stt/stt.cjs +1 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -1
- package/dist/stt/stt.d.ts +7 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +1 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/voice/agent_activity.cjs +83 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +6 -2
- package/dist/voice/agent_activity.d.ts +6 -2
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +83 -8
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +3 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -1
- package/dist/voice/agent_session.d.ts +2 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +3 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +138 -16
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +11 -0
- package/dist/voice/audio_recognition.d.ts +11 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +138 -16
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +0 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs +17 -11
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +16 -9
- package/dist/worker.d.ts +16 -9
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +16 -12
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +17 -17
- package/src/inference/stt.ts +2 -1
- package/src/inference/tts.ts +2 -5
- package/src/llm/chat_context.test.ts +607 -0
- package/src/llm/chat_context.ts +106 -0
- package/src/llm/tool_context.ts +44 -0
- package/src/metrics/base.ts +7 -0
- package/src/stt/stt.ts +6 -0
- package/src/voice/agent_activity.ts +119 -9
- package/src/voice/agent_session.ts +3 -1
- package/src/voice/audio_recognition.ts +235 -57
- package/src/voice/room_io/_input.ts +1 -1
- package/src/worker.ts +29 -18
package/dist/stt/stt.d.cts
CHANGED
|
@@ -29,7 +29,13 @@ export declare enum SpeechEventType {
|
|
|
29
29
|
*/
|
|
30
30
|
END_OF_SPEECH = 3,
|
|
31
31
|
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
32
|
-
RECOGNITION_USAGE = 4
|
|
32
|
+
RECOGNITION_USAGE = 4,
|
|
33
|
+
/**
|
|
34
|
+
* Preflight transcript, emitted before final transcript when STT has high confidence
|
|
35
|
+
* but hasn't fully committed yet. Includes all pre-committed transcripts including
|
|
36
|
+
* final transcript from the previous STT run.
|
|
37
|
+
*/
|
|
38
|
+
PREFLIGHT_TRANSCRIPT = 5
|
|
33
39
|
}
|
|
34
40
|
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
35
41
|
export interface SpeechData {
|
package/dist/stt/stt.d.ts
CHANGED
|
@@ -29,7 +29,13 @@ export declare enum SpeechEventType {
|
|
|
29
29
|
*/
|
|
30
30
|
END_OF_SPEECH = 3,
|
|
31
31
|
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
32
|
-
RECOGNITION_USAGE = 4
|
|
32
|
+
RECOGNITION_USAGE = 4,
|
|
33
|
+
/**
|
|
34
|
+
* Preflight transcript, emitted before final transcript when STT has high confidence
|
|
35
|
+
* but hasn't fully committed yet. Includes all pre-committed transcripts including
|
|
36
|
+
* final transcript from the previous STT run.
|
|
37
|
+
*/
|
|
38
|
+
PREFLIGHT_TRANSCRIPT = 5
|
|
33
39
|
}
|
|
34
40
|
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
35
41
|
export interface SpeechData {
|
package/dist/stt/stt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../src/stt/stt.ts"],"names":[],"mappings":";AAGA,OAAO,EAAE,KAAK,UAAU,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAItD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAErD,OAAO,EAAE,KAAK,iBAAiB,EAA+B,MAAM,aAAa,CAAC;AAClF,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAA6B,MAAM,aAAa,CAAC;AAE5E,2CAA2C;AAC3C,oBAAY,eAAe;IACzB;;;;OAIG;IACH,eAAe,IAAI;IACnB;;OAEG;IACH,kBAAkB,IAAI;IACtB;;;OAGG;IACH,gBAAgB,IAAI;IACpB;;;OAGG;IACH,aAAa,IAAI;IACjB,mEAAmE;IACnE,iBAAiB,IAAI;
|
|
1
|
+
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../src/stt/stt.ts"],"names":[],"mappings":";AAGA,OAAO,EAAE,KAAK,UAAU,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAItD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAErD,OAAO,EAAE,KAAK,iBAAiB,EAA+B,MAAM,aAAa,CAAC;AAClF,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAA6B,MAAM,aAAa,CAAC;AAE5E,2CAA2C;AAC3C,oBAAY,eAAe;IACzB;;;;OAIG;IACH,eAAe,IAAI;IACnB;;OAEG;IACH,kBAAkB,IAAI;IACtB;;;OAGG;IACH,gBAAgB,IAAI;IACpB;;;OAGG;IACH,aAAa,IAAI;IACjB,mEAAmE;IACnE,iBAAiB,IAAI;IACrB;;;;OAIG;IACH,oBAAoB,IAAI;CACzB;AAED,mEAAmE;AACnE,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,sDAAsD;AACtD,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,eAAe,CAAC;IACtB,YAAY,CAAC,EAAE,CAAC,UAAU,EAAE,GAAG,UAAU,EAAE,CAAC,CAAC;IAC7C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,gBAAgB,CAAC;CACrC;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;IACnB,cAAc,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,WAAW,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,KAAK,CAAC;IACb,WAAW,EAAE,OAAO,CAAC;CACtB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,mBAAmB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;IACrD,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;CACtC,CAAC;kCAS2D,aAAa,YAAY,CAAC;AAPvF;;;;;;GAMG;AACH,8BAAsB,GAAI,SAAQ,QAAsD;;IACtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAGX,YAAY,EAAE,eAAe;IAKzC,sCAAsC;IACtC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED,8FAA8F;IACxF,SAAS,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;IAezD,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;IAEvE;;;OAGG;IACH,QAAQ,CAAC,MAAM,IAAI,YAAY;CAChC;AAED;;;;;;;;;;;;;;;GAeG;AACH,8BAAsB,YAAa,YAAW,qBAAqB,CAAC,WAAW,CAAC;;IAC9E,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,sEAA6E;IAC5F,SAAS,CAAC,MAAM,kCAAyC;IACzD,SAAS,CAAC,KAAK,kCAAyC;IACxD,SAAS,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IACpC,SAAS,CAAC,SAAS,CAAC,EAAE,cAAc,CAAC;IACrC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,MAAM,UAAS;IAEzB,OAAO,CAAC,mBAAmB,CAAqC;IAChE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,YAAY,CAAoB;gBAGtC,GAAG,EAAE,GAAG,EACR,UAAU,CAAC,EAAE,MAAM,EACnB,iBAAiB,GAAE,iBAA+C;YAgBtD,QAAQ;IAoCtB,OAAO,CAAC,SAAS;cAUD,SAAS;cAkBT,cAAc;IAkB9B,SAAS,CAAC,QAAQ,CAAC,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;IAEvC,iBAAiB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC;IAIzD,iBAAiB;IAIjB,qCAAqC;IACrC,SAAS,CAAC,KAAK,EAAE,UAAU;IAwB3B,4DAA4D;IAC5D,KAAK;IAUL,2DAA2D;IAC3D,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC;IAI5C,wDAAwD;IACxD,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,YAAY;CAGvC"}
|
package/dist/stt/stt.js
CHANGED
|
@@ -12,6 +12,7 @@ var SpeechEventType = /* @__PURE__ */ ((SpeechEventType2) => {
|
|
|
12
12
|
SpeechEventType2[SpeechEventType2["FINAL_TRANSCRIPT"] = 2] = "FINAL_TRANSCRIPT";
|
|
13
13
|
SpeechEventType2[SpeechEventType2["END_OF_SPEECH"] = 3] = "END_OF_SPEECH";
|
|
14
14
|
SpeechEventType2[SpeechEventType2["RECOGNITION_USAGE"] = 4] = "RECOGNITION_USAGE";
|
|
15
|
+
SpeechEventType2[SpeechEventType2["PREFLIGHT_TRANSCRIPT"] = 5] = "PREFLIGHT_TRANSCRIPT";
|
|
15
16
|
return SpeechEventType2;
|
|
16
17
|
})(SpeechEventType || {});
|
|
17
18
|
class STT extends EventEmitter {
|
package/dist/stt/stt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioFrame, AudioResampler } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport { APIConnectionError, APIError } from '../_exceptions.js';\nimport { calculateAudioDurationSeconds } from '../audio.js';\nimport { log } from '../log.js';\nimport type { STTMetrics } from '../metrics/base.js';\nimport { DeferredReadableStream } from '../stream/deferred_stream.js';\nimport { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n /** Usage event, emitted periodically to indicate usage metrics. */\n RECOGNITION_USAGE = 4,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\nexport interface RecognitionUsage {\n audioDuration: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n requestId?: string;\n recognitionUsage?: RecognitionUsage;\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\nexport interface STTError {\n type: 'stt_error';\n timestamp: number;\n label: string;\n error: Error;\n recoverable: boolean;\n}\n\nexport type STTCallbacks = {\n ['metrics_collected']: (metrics: STTMetrics) => void;\n ['error']: (error: STTError) => void;\n};\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {\n abstract label: string;\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n async recognize(frame: AudioBuffer): Promise<SpeechEvent> {\n const startTime = process.hrtime.bigint();\n const event = await this._recognize(frame);\n const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.emit('metrics_collected', {\n type: 'stt_metrics',\n requestId: event.requestId ?? '',\n timestamp: Date.now(),\n durationMs,\n label: this.label,\n audioDurationMs: Math.round(calculateAudioDurationSeconds(frame) * 1000),\n streamed: false,\n });\n return event;\n }\n protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;\n\n /**\n * Returns a {@link SpeechStream} that can be used to push audio frames and receive\n * transcriptions\n */\n abstract stream(): SpeechStream;\n}\n\n/**\n * An instance of a speech-to-text stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {\n * console.log(event.alternatives[0].text)\n * }\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SpeechStream class, which inherits this class's methods.\n */\nexport abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();\n protected output = new AsyncIterableQueue<SpeechEvent>();\n protected queue = new AsyncIterableQueue<SpeechEvent>();\n protected neededSampleRate?: number;\n protected resampler?: AudioResampler;\n abstract label: string;\n protected closed = false;\n #stt: STT;\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n private logger = log();\n private _connOptions: APIConnectOptions;\n\n constructor(\n stt: STT,\n sampleRate?: number,\n connectionOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,\n ) {\n this.#stt = stt;\n this._connOptions = connectionOptions;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n this.neededSampleRate = sampleRate;\n this.monitorMetrics();\n this.pumpInput();\n\n // this is a hack to immitate asyncio.create_task so that mainTask\n // is run **after** the constructor has finished. Otherwise we get\n // runtime error when trying to access class variables in the\n // `run` method.\n startSoon(() => this.mainTask().then(() => this.queue.close()));\n }\n\n private async mainTask() {\n for (let i = 0; i < this._connOptions.maxRetry + 1; i++) {\n try {\n return await this.run();\n } catch (error) {\n if (error instanceof APIError) {\n const retryInterval = this._connOptions._intervalForRetry(i);\n\n if (this._connOptions.maxRetry === 0 || !error.retryable) {\n this.emitError({ error, recoverable: false });\n throw error;\n } else if (i === this._connOptions.maxRetry) {\n this.emitError({ error, recoverable: false });\n throw new APIConnectionError({\n message: `failed to recognize speech after ${this._connOptions.maxRetry + 1} attempts`,\n options: { retryable: false },\n });\n } else {\n this.emitError({ error, recoverable: true });\n this.logger.warn(\n { tts: this.#stt.label, attempt: i + 1, error },\n `failed to recognize speech, retrying in ${retryInterval}s`,\n );\n }\n\n if (retryInterval > 0) {\n await delay(retryInterval);\n }\n } else {\n this.emitError({ error: toError(error), recoverable: false });\n throw error;\n }\n }\n }\n }\n\n private emitError({ error, recoverable }: { error: Error; recoverable: boolean }) {\n this.#stt.emit('error', {\n type: 'stt_error',\n timestamp: Date.now(),\n label: this.#stt.label,\n error,\n recoverable,\n });\n }\n\n protected async pumpInput() {\n // TODO(AJS-35): Implement STT with webstreams API\n const inputStream = this.deferredInputStream.stream;\n const reader = inputStream.getReader();\n\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n this.pushFrame(value);\n }\n } catch (error) {\n this.logger.error('Error in STTStream mainTask:', error);\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n for await (const event of this.queue) {\n this.output.put(event);\n if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;\n const metrics: STTMetrics = {\n type: 'stt_metrics',\n timestamp: Date.now(),\n requestId: event.requestId!,\n durationMs: 0,\n label: this.#stt.label,\n audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000),\n streamed: true,\n };\n this.#stt.emit('metrics_collected', metrics);\n }\n this.output.close();\n }\n\n protected abstract run(): Promise<void>;\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** Push an audio frame to the STT */\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.neededSampleRate && frame.sampleRate !== this.neededSampleRate) {\n if (!this.resampler) {\n this.resampler = new AudioResampler(frame.sampleRate, this.neededSampleRate);\n }\n }\n\n if (this.resampler) {\n const frames = this.resampler.push(frame);\n for (const frame of frames) {\n this.input.put(frame);\n }\n } else {\n this.input.put(frame);\n }\n }\n\n /** Flush the STT, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SpeechStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SpeechEvent>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the STT stream */\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SpeechStream {\n return this;\n }\n}\n"],"mappings":"AAGA,SAA0B,sBAAsB;AAEhD,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,gBAAgB;AAC7C,SAAS,qCAAqC;AAC9C,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAiC,mCAAmC;AAEpE,SAAS,oBAAoB,OAAO,WAAW,eAAe;AAGvD,IAAK,kBAAL,kBAAKA,qBAAL;AAML,EAAAA,kCAAA,qBAAkB,KAAlB;AAIA,EAAAA,kCAAA,wBAAqB,KAArB;AAKA,EAAAA,kCAAA,sBAAmB,KAAnB;AAKA,EAAAA,kCAAA,mBAAgB,KAAhB;AAEA,EAAAA,kCAAA,uBAAoB,KAApB;AAtBU,SAAAA;AAAA,GAAA;AA6EL,MAAe,YAAa,aAAsD;AAAA,EAEvF;AAAA,EAEA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAAM,UAAU,OAA0C;AACxD,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,UAAM,QAAQ,MAAM,KAAK,WAAW,KAAK;AACzC,UAAM,aAAa,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACjF,SAAK,KAAK,qBAAqB;AAAA,MAC7B,MAAM;AAAA,MACN,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,iBAAiB,KAAK,MAAM,8BAA8B,KAAK,IAAI,GAAI;AAAA,MACvE,UAAU;AAAA,IACZ,CAAC;AACD,WAAO;AAAA,EACT;AAQF;AAkBO,MAAe,aAA2D;AAAA,EAC/E,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAoE;AAAA,EAChF,SAAS,IAAI,mBAAgC;AAAA,EAC7C,QAAQ,IAAI,mBAAgC;AAAA,EAC5C;AAAA,EACA;AAAA,EAEA,SAAS;AAAA,EACnB;AAAA,EACQ;AAAA,EACA,SAAS,IAAI;AAAA,EACb;AAAA,EAER,YACE,KACA,YACA,oBAAuC,6BACvC;AACA,SAAK,OAAO;AACZ,SAAK,eAAe;AACpB,SAAK,sBAAsB,IAAI,uBAAmC;AAClE,SAAK,mBAAmB;AACxB,SAAK,eAAe;AACpB,SAAK,UAAU;AAMf,cAAU,MAAM,KAAK,SAAS,EAAE,KAAK,MAAM,KAAK,MAAM,MAAM,CAAC,CAAC;AAAA,EAChE;AAAA,EAEA,MAAc,WAAW;AACvB,aAAS,IAAI,GAAG,IAAI,KAAK,aAAa,WAAW,GAAG,KAAK;AACvD,UAAI;AACF,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB,SAAS,OAAO;AACd,YAAI,iBAAiB,UAAU;AAC7B,gBAAM,gBAAgB,KAAK,aAAa,kBAAkB,CAAC;AAE3D,cAAI,KAAK,aAAa,aAAa,KAAK,CAAC,MAAM,WAAW;AACxD,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM;AAAA,UACR,WAAW,MAAM,KAAK,aAAa,UAAU;AAC3C,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM,IAAI,mBAAmB;AAAA,cAC3B,SAAS,oCAAoC,KAAK,aAAa,WAAW,CAAC;AAAA,cAC3E,SAAS,EAAE,WAAW,MAAM;AAAA,YAC9B,CAAC;AAAA,UACH,OAAO;AACL,iBAAK,UAAU,EAAE,OAAO,aAAa,KAAK,CAAC;AAC3C,iBAAK,OAAO;AAAA,cACV,EAAE,KAAK,KAAK,KAAK,OAAO,SAAS,IAAI,GAAG,MAAM;AAAA,cAC9C,2CAA2C,aAAa;AAAA,YAC1D;AAAA,UACF;AAEA,cAAI,gBAAgB,GAAG;AACrB,kBAAM,MAAM,aAAa;AAAA,UAC3B;AAAA,QACF,OAAO;AACL,eAAK,UAAU,EAAE,OAAO,QAAQ,KAAK,GAAG,aAAa,MAAM,CAAC;AAC5D,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,UAAU,EAAE,OAAO,YAAY,GAA2C;AAChF,SAAK,KAAK,KAAK,SAAS;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,KAAK,IAAI;AAAA,MACpB,OAAO,KAAK,KAAK;AAAA,MACjB;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAgB,YAAY;AAE1B,UAAM,cAAc,KAAK,oBAAoB;AAC7C,UAAM,SAAS,YAAY,UAAU;AAErC,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,aAAK,UAAU,KAAK;AAAA,MACtB;AAAA,IACF,SAAS,OAAO;AACd,WAAK,OAAO,MAAM,gCAAgC,KAAK;AAAA,IACzD,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,MAAM,SAAS,0BAAmC;AACtD,YAAM,UAAsB;AAAA,QAC1B,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,WAAW,MAAM;AAAA,QACjB,YAAY;AAAA,QACZ,OAAO,KAAK,KAAK;AAAA,QACjB,iBAAiB,KAAK,MAAM,MAAM,iBAAkB,gBAAgB,GAAI;AAAA,QACxE,UAAU;AAAA,MACZ;AACA,WAAK,KAAK,KAAK,qBAAqB,OAAO;AAAA,IAC7C;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAIA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,oBAAoB,MAAM,eAAe,KAAK,kBAAkB;AACvE,UAAI,CAAC,KAAK,WAAW;AACnB,aAAK,YAAY,IAAI,eAAe,MAAM,YAAY,KAAK,gBAAgB;AAAA,MAC7E;AAAA,IACF;AAEA,QAAI,KAAK,WAAW;AAClB,YAAM,SAAS,KAAK,UAAU,KAAK,KAAK;AACxC,iBAAWC,UAAS,QAAQ;AAC1B,aAAK,MAAM,IAAIA,MAAK;AAAA,MACtB;AAAA,IACF,OAAO;AACL,WAAK,MAAM,IAAI,KAAK;AAAA,IACtB;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,aAAa,cAAc;AAAA,EAC5C;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA6C;AAC3C,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAkB;AACrC,WAAO;AAAA,EACT;AACF;","names":["SpeechEventType","frame"]}
|
|
1
|
+
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioFrame, AudioResampler } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport { APIConnectionError, APIError } from '../_exceptions.js';\nimport { calculateAudioDurationSeconds } from '../audio.js';\nimport { log } from '../log.js';\nimport type { STTMetrics } from '../metrics/base.js';\nimport { DeferredReadableStream } from '../stream/deferred_stream.js';\nimport { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n /** Usage event, emitted periodically to indicate usage metrics. */\n RECOGNITION_USAGE = 4,\n /**\n * Preflight transcript, emitted before final transcript when STT has high confidence\n * but hasn't fully committed yet. Includes all pre-committed transcripts including\n * final transcript from the previous STT run.\n */\n PREFLIGHT_TRANSCRIPT = 5,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\nexport interface RecognitionUsage {\n audioDuration: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n requestId?: string;\n recognitionUsage?: RecognitionUsage;\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\nexport interface STTError {\n type: 'stt_error';\n timestamp: number;\n label: string;\n error: Error;\n recoverable: boolean;\n}\n\nexport type STTCallbacks = {\n ['metrics_collected']: (metrics: STTMetrics) => void;\n ['error']: (error: STTError) => void;\n};\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {\n abstract label: string;\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n async recognize(frame: AudioBuffer): Promise<SpeechEvent> {\n const startTime = process.hrtime.bigint();\n const event = await this._recognize(frame);\n const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.emit('metrics_collected', {\n type: 'stt_metrics',\n requestId: event.requestId ?? '',\n timestamp: Date.now(),\n durationMs,\n label: this.label,\n audioDurationMs: Math.round(calculateAudioDurationSeconds(frame) * 1000),\n streamed: false,\n });\n return event;\n }\n protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;\n\n /**\n * Returns a {@link SpeechStream} that can be used to push audio frames and receive\n * transcriptions\n */\n abstract stream(): SpeechStream;\n}\n\n/**\n * An instance of a speech-to-text stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {\n * console.log(event.alternatives[0].text)\n * }\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SpeechStream class, which inherits this class's methods.\n */\nexport abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();\n protected output = new AsyncIterableQueue<SpeechEvent>();\n protected queue = new AsyncIterableQueue<SpeechEvent>();\n protected neededSampleRate?: number;\n protected resampler?: AudioResampler;\n abstract label: string;\n protected closed = false;\n #stt: STT;\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n private logger = log();\n private _connOptions: APIConnectOptions;\n\n constructor(\n stt: STT,\n sampleRate?: number,\n connectionOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,\n ) {\n this.#stt = stt;\n this._connOptions = connectionOptions;\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n this.neededSampleRate = sampleRate;\n this.monitorMetrics();\n this.pumpInput();\n\n // this is a hack to immitate asyncio.create_task so that mainTask\n // is run **after** the constructor has finished. Otherwise we get\n // runtime error when trying to access class variables in the\n // `run` method.\n startSoon(() => this.mainTask().then(() => this.queue.close()));\n }\n\n private async mainTask() {\n for (let i = 0; i < this._connOptions.maxRetry + 1; i++) {\n try {\n return await this.run();\n } catch (error) {\n if (error instanceof APIError) {\n const retryInterval = this._connOptions._intervalForRetry(i);\n\n if (this._connOptions.maxRetry === 0 || !error.retryable) {\n this.emitError({ error, recoverable: false });\n throw error;\n } else if (i === this._connOptions.maxRetry) {\n this.emitError({ error, recoverable: false });\n throw new APIConnectionError({\n message: `failed to recognize speech after ${this._connOptions.maxRetry + 1} attempts`,\n options: { retryable: false },\n });\n } else {\n this.emitError({ error, recoverable: true });\n this.logger.warn(\n { tts: this.#stt.label, attempt: i + 1, error },\n `failed to recognize speech, retrying in ${retryInterval}s`,\n );\n }\n\n if (retryInterval > 0) {\n await delay(retryInterval);\n }\n } else {\n this.emitError({ error: toError(error), recoverable: false });\n throw error;\n }\n }\n }\n }\n\n private emitError({ error, recoverable }: { error: Error; recoverable: boolean }) {\n this.#stt.emit('error', {\n type: 'stt_error',\n timestamp: Date.now(),\n label: this.#stt.label,\n error,\n recoverable,\n });\n }\n\n protected async pumpInput() {\n // TODO(AJS-35): Implement STT with webstreams API\n const inputStream = this.deferredInputStream.stream;\n const reader = inputStream.getReader();\n\n try {\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n this.pushFrame(value);\n }\n } catch (error) {\n this.logger.error('Error in STTStream mainTask:', error);\n } finally {\n reader.releaseLock();\n }\n }\n\n protected async monitorMetrics() {\n for await (const event of this.queue) {\n this.output.put(event);\n if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;\n const metrics: STTMetrics = {\n type: 'stt_metrics',\n timestamp: Date.now(),\n requestId: event.requestId!,\n durationMs: 0,\n label: this.#stt.label,\n audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000),\n streamed: true,\n };\n this.#stt.emit('metrics_collected', metrics);\n }\n this.output.close();\n }\n\n protected abstract run(): Promise<void>;\n\n updateInputStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputStream() {\n this.deferredInputStream.detachSource();\n }\n\n /** Push an audio frame to the STT */\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.neededSampleRate && frame.sampleRate !== this.neededSampleRate) {\n if (!this.resampler) {\n this.resampler = new AudioResampler(frame.sampleRate, this.neededSampleRate);\n }\n }\n\n if (this.resampler) {\n const frames = this.resampler.push(frame);\n for (const frame of frames) {\n this.input.put(frame);\n }\n } else {\n this.input.put(frame);\n }\n }\n\n /** Flush the STT, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SpeechStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SpeechEvent>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the STT stream */\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SpeechStream {\n return this;\n }\n}\n"],"mappings":"AAGA,SAA0B,sBAAsB;AAEhD,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,gBAAgB;AAC7C,SAAS,qCAAqC;AAC9C,SAAS,WAAW;AAEpB,SAAS,8BAA8B;AACvC,SAAiC,mCAAmC;AAEpE,SAAS,oBAAoB,OAAO,WAAW,eAAe;AAGvD,IAAK,kBAAL,kBAAKA,qBAAL;AAML,EAAAA,kCAAA,qBAAkB,KAAlB;AAIA,EAAAA,kCAAA,wBAAqB,KAArB;AAKA,EAAAA,kCAAA,sBAAmB,KAAnB;AAKA,EAAAA,kCAAA,mBAAgB,KAAhB;AAEA,EAAAA,kCAAA,uBAAoB,KAApB;AAMA,EAAAA,kCAAA,0BAAuB,KAAvB;AA5BU,SAAAA;AAAA,GAAA;AAmFL,MAAe,YAAa,aAAsD;AAAA,EAEvF;AAAA,EAEA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAAM,UAAU,OAA0C;AACxD,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,UAAM,QAAQ,MAAM,KAAK,WAAW,KAAK;AACzC,UAAM,aAAa,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AACjF,SAAK,KAAK,qBAAqB;AAAA,MAC7B,MAAM;AAAA,MACN,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,iBAAiB,KAAK,MAAM,8BAA8B,KAAK,IAAI,GAAI;AAAA,MACvE,UAAU;AAAA,IACZ,CAAC;AACD,WAAO;AAAA,EACT;AAQF;AAkBO,MAAe,aAA2D;AAAA,EAC/E,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAoE;AAAA,EAChF,SAAS,IAAI,mBAAgC;AAAA,EAC7C,QAAQ,IAAI,mBAAgC;AAAA,EAC5C;AAAA,EACA;AAAA,EAEA,SAAS;AAAA,EACnB;AAAA,EACQ;AAAA,EACA,SAAS,IAAI;AAAA,EACb;AAAA,EAER,YACE,KACA,YACA,oBAAuC,6BACvC;AACA,SAAK,OAAO;AACZ,SAAK,eAAe;AACpB,SAAK,sBAAsB,IAAI,uBAAmC;AAClE,SAAK,mBAAmB;AACxB,SAAK,eAAe;AACpB,SAAK,UAAU;AAMf,cAAU,MAAM,KAAK,SAAS,EAAE,KAAK,MAAM,KAAK,MAAM,MAAM,CAAC,CAAC;AAAA,EAChE;AAAA,EAEA,MAAc,WAAW;AACvB,aAAS,IAAI,GAAG,IAAI,KAAK,aAAa,WAAW,GAAG,KAAK;AACvD,UAAI;AACF,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB,SAAS,OAAO;AACd,YAAI,iBAAiB,UAAU;AAC7B,gBAAM,gBAAgB,KAAK,aAAa,kBAAkB,CAAC;AAE3D,cAAI,KAAK,aAAa,aAAa,KAAK,CAAC,MAAM,WAAW;AACxD,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM;AAAA,UACR,WAAW,MAAM,KAAK,aAAa,UAAU;AAC3C,iBAAK,UAAU,EAAE,OAAO,aAAa,MAAM,CAAC;AAC5C,kBAAM,IAAI,mBAAmB;AAAA,cAC3B,SAAS,oCAAoC,KAAK,aAAa,WAAW,CAAC;AAAA,cAC3E,SAAS,EAAE,WAAW,MAAM;AAAA,YAC9B,CAAC;AAAA,UACH,OAAO;AACL,iBAAK,UAAU,EAAE,OAAO,aAAa,KAAK,CAAC;AAC3C,iBAAK,OAAO;AAAA,cACV,EAAE,KAAK,KAAK,KAAK,OAAO,SAAS,IAAI,GAAG,MAAM;AAAA,cAC9C,2CAA2C,aAAa;AAAA,YAC1D;AAAA,UACF;AAEA,cAAI,gBAAgB,GAAG;AACrB,kBAAM,MAAM,aAAa;AAAA,UAC3B;AAAA,QACF,OAAO;AACL,eAAK,UAAU,EAAE,OAAO,QAAQ,KAAK,GAAG,aAAa,MAAM,CAAC;AAC5D,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,UAAU,EAAE,OAAO,YAAY,GAA2C;AAChF,SAAK,KAAK,KAAK,SAAS;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,KAAK,IAAI;AAAA,MACpB,OAAO,KAAK,KAAK;AAAA,MACjB;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAgB,YAAY;AAE1B,UAAM,cAAc,KAAK,oBAAoB;AAC7C,UAAM,SAAS,YAAY,UAAU;AAErC,QAAI;AACF,aAAO,MAAM;AACX,cAAM,EAAE,MAAM,MAAM,IAAI,MAAM,OAAO,KAAK;AAC1C,YAAI,KAAM;AACV,aAAK,UAAU,KAAK;AAAA,MACtB;AAAA,IACF,SAAS,OAAO;AACd,WAAK,OAAO,MAAM,gCAAgC,KAAK;AAAA,IACzD,UAAE;AACA,aAAO,YAAY;AAAA,IACrB;AAAA,EACF;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,MAAM,SAAS,0BAAmC;AACtD,YAAM,UAAsB;AAAA,QAC1B,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,WAAW,MAAM;AAAA,QACjB,YAAY;AAAA,QACZ,OAAO,KAAK,KAAK;AAAA,QACjB,iBAAiB,KAAK,MAAM,MAAM,iBAAkB,gBAAgB,GAAI;AAAA,QACxE,UAAU;AAAA,MACZ;AACA,WAAK,KAAK,KAAK,qBAAqB,OAAO;AAAA,IAC7C;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAIA,kBAAkB,aAAyC;AACzD,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,oBAAoB;AAClB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA;AAAA,EAGA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,oBAAoB,MAAM,eAAe,KAAK,kBAAkB;AACvE,UAAI,CAAC,KAAK,WAAW;AACnB,aAAK,YAAY,IAAI,eAAe,MAAM,YAAY,KAAK,gBAAgB;AAAA,MAC7E;AAAA,IACF;AAEA,QAAI,KAAK,WAAW;AAClB,YAAM,SAAS,KAAK,UAAU,KAAK,KAAK;AACxC,iBAAWC,UAAS,QAAQ;AAC1B,aAAK,MAAM,IAAIA,MAAK;AAAA,MACtB;AAAA,IACF,OAAO;AACL,WAAK,MAAM,IAAI,KAAK;AAAA,IACtB;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,aAAa,cAAc;AAAA,EAC5C;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA6C;AAC3C,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAkB;AACrC,WAAO;AAAA,EACT;AACF;","names":["SpeechEventType","frame"]}
|
|
@@ -27,6 +27,7 @@ var import_node_async_hooks = require("node:async_hooks");
|
|
|
27
27
|
var import_web = require("node:stream/web");
|
|
28
28
|
var import_chat_context = require("../llm/chat_context.cjs");
|
|
29
29
|
var import_llm = require("../llm/index.cjs");
|
|
30
|
+
var import_tool_context = require("../llm/tool_context.cjs");
|
|
30
31
|
var import_log = require("../log.cjs");
|
|
31
32
|
var import_deferred_stream = require("../stream/deferred_stream.cjs");
|
|
32
33
|
var import_stt = require("../stt/stt.cjs");
|
|
@@ -58,6 +59,7 @@ class AgentActivity {
|
|
|
58
59
|
audioStream = new import_deferred_stream.DeferredReadableStream();
|
|
59
60
|
// default to null as None, which maps to the default provider tool choice value
|
|
60
61
|
toolChoice = null;
|
|
62
|
+
_preemptiveGeneration;
|
|
61
63
|
agent;
|
|
62
64
|
agentSession;
|
|
63
65
|
/** @internal */
|
|
@@ -430,8 +432,12 @@ class AgentActivity {
|
|
|
430
432
|
onStartOfSpeech(_ev) {
|
|
431
433
|
this.agentSession._updateUserState("speaking");
|
|
432
434
|
}
|
|
433
|
-
onEndOfSpeech(
|
|
434
|
-
|
|
435
|
+
onEndOfSpeech(ev) {
|
|
436
|
+
let speechEndTime = Date.now();
|
|
437
|
+
if (ev) {
|
|
438
|
+
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
439
|
+
}
|
|
440
|
+
this.agentSession._updateUserState("listening", speechEndTime);
|
|
435
441
|
}
|
|
436
442
|
onVADInferenceDone(ev) {
|
|
437
443
|
var _a, _b;
|
|
@@ -485,6 +491,44 @@ class AgentActivity {
|
|
|
485
491
|
})
|
|
486
492
|
);
|
|
487
493
|
}
|
|
494
|
+
onPreemptiveGeneration(info) {
|
|
495
|
+
if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
496
|
+
return;
|
|
497
|
+
}
|
|
498
|
+
this.cancelPreemptiveGeneration();
|
|
499
|
+
this.logger.info(
|
|
500
|
+
{
|
|
501
|
+
newTranscript: info.newTranscript,
|
|
502
|
+
transcriptConfidence: info.transcriptConfidence
|
|
503
|
+
},
|
|
504
|
+
"starting preemptive generation"
|
|
505
|
+
);
|
|
506
|
+
const userMessage = import_chat_context.ChatMessage.create({
|
|
507
|
+
role: "user",
|
|
508
|
+
content: info.newTranscript
|
|
509
|
+
});
|
|
510
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
511
|
+
const speechHandle = this.generateReply({
|
|
512
|
+
userMessage,
|
|
513
|
+
chatCtx,
|
|
514
|
+
scheduleSpeech: false
|
|
515
|
+
});
|
|
516
|
+
this._preemptiveGeneration = {
|
|
517
|
+
speechHandle,
|
|
518
|
+
userMessage,
|
|
519
|
+
info,
|
|
520
|
+
chatCtx: chatCtx.copy(),
|
|
521
|
+
tools: { ...this.tools },
|
|
522
|
+
toolChoice: this.toolChoice,
|
|
523
|
+
createdAt: Date.now()
|
|
524
|
+
};
|
|
525
|
+
}
|
|
526
|
+
cancelPreemptiveGeneration() {
|
|
527
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
528
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
529
|
+
this._preemptiveGeneration = void 0;
|
|
530
|
+
}
|
|
531
|
+
}
|
|
488
532
|
createSpeechTask(options) {
|
|
489
533
|
const { task, ownedSpeechHandle } = options;
|
|
490
534
|
this.speechTasks.add(task);
|
|
@@ -506,10 +550,12 @@ class AgentActivity {
|
|
|
506
550
|
}
|
|
507
551
|
async onEndOfTurn(info) {
|
|
508
552
|
if (this.draining) {
|
|
553
|
+
this.cancelPreemptiveGeneration();
|
|
509
554
|
this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
|
|
510
555
|
return true;
|
|
511
556
|
}
|
|
512
557
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
|
|
558
|
+
this.cancelPreemptiveGeneration();
|
|
513
559
|
this.logger.info("skipping user input, new_transcript is too short");
|
|
514
560
|
return false;
|
|
515
561
|
}
|
|
@@ -563,7 +609,8 @@ class AgentActivity {
|
|
|
563
609
|
chatCtx,
|
|
564
610
|
instructions: defaultInstructions,
|
|
565
611
|
toolChoice: defaultToolChoice,
|
|
566
|
-
allowInterruptions: defaultAllowInterruptions
|
|
612
|
+
allowInterruptions: defaultAllowInterruptions,
|
|
613
|
+
scheduleSpeech = true
|
|
567
614
|
} = options;
|
|
568
615
|
let instructions = defaultInstructions;
|
|
569
616
|
let toolChoice = defaultToolChoice;
|
|
@@ -636,7 +683,9 @@ ${instructions}` : instructions,
|
|
|
636
683
|
});
|
|
637
684
|
task.finally(() => this.onPipelineReplyDone());
|
|
638
685
|
}
|
|
639
|
-
|
|
686
|
+
if (scheduleSpeech) {
|
|
687
|
+
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
688
|
+
}
|
|
640
689
|
return handle;
|
|
641
690
|
}
|
|
642
691
|
interrupt() {
|
|
@@ -709,13 +758,36 @@ ${instructions}` : instructions,
|
|
|
709
758
|
} else if (this.llm === void 0) {
|
|
710
759
|
return;
|
|
711
760
|
}
|
|
712
|
-
|
|
761
|
+
let speechHandle;
|
|
762
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
763
|
+
const preemptive = this._preemptiveGeneration;
|
|
764
|
+
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
|
|
765
|
+
speechHandle = preemptive.speechHandle;
|
|
766
|
+
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
767
|
+
this.logger.debug(
|
|
768
|
+
{
|
|
769
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt
|
|
770
|
+
},
|
|
771
|
+
"using preemptive generation"
|
|
772
|
+
);
|
|
773
|
+
} else {
|
|
774
|
+
this.logger.warn(
|
|
775
|
+
"preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
|
|
776
|
+
);
|
|
777
|
+
preemptive.speechHandle._cancel();
|
|
778
|
+
}
|
|
779
|
+
this._preemptiveGeneration = void 0;
|
|
780
|
+
}
|
|
781
|
+
if (speechHandle === void 0) {
|
|
782
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
783
|
+
}
|
|
713
784
|
const eouMetrics = {
|
|
714
785
|
type: "eou_metrics",
|
|
715
786
|
timestamp: Date.now(),
|
|
716
787
|
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
717
788
|
transcriptionDelayMs: info.transcriptionDelay,
|
|
718
789
|
onUserTurnCompletedDelayMs: callbackDuration,
|
|
790
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
719
791
|
speechId: speechHandle.id
|
|
720
792
|
};
|
|
721
793
|
this.agentSession.emit(
|
|
@@ -823,8 +895,6 @@ ${instructions}` : instructions,
|
|
|
823
895
|
chatCtx = chatCtx.copy();
|
|
824
896
|
if (newMessage) {
|
|
825
897
|
chatCtx.insert(newMessage);
|
|
826
|
-
this.agent._chatCtx.insert(newMessage);
|
|
827
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
828
898
|
}
|
|
829
899
|
if (instructions) {
|
|
830
900
|
try {
|
|
@@ -837,7 +907,6 @@ ${instructions}` : instructions,
|
|
|
837
907
|
this.logger.error({ error: e }, "error occurred during updateInstructions");
|
|
838
908
|
}
|
|
839
909
|
}
|
|
840
|
-
this.agentSession._updateAgentState("thinking");
|
|
841
910
|
const tasks = [];
|
|
842
911
|
const [llmTask, llmGenData] = (0, import_generation.performLLMInference)(
|
|
843
912
|
// preserve `this` context in llmNode
|
|
@@ -861,6 +930,10 @@ ${instructions}` : instructions,
|
|
|
861
930
|
tasks.push(ttsTask);
|
|
862
931
|
}
|
|
863
932
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
933
|
+
if (newMessage && speechHandle.scheduled) {
|
|
934
|
+
this.agent._chatCtx.insert(newMessage);
|
|
935
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
936
|
+
}
|
|
864
937
|
if (speechHandle.interrupted) {
|
|
865
938
|
replyAbortController.abort();
|
|
866
939
|
await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -1442,6 +1515,7 @@ ${instructions}` : instructions,
|
|
|
1442
1515
|
const unlock = await this.lock.lock();
|
|
1443
1516
|
try {
|
|
1444
1517
|
if (this._draining) return;
|
|
1518
|
+
this.cancelPreemptiveGeneration();
|
|
1445
1519
|
this.createSpeechTask({
|
|
1446
1520
|
task: import_utils.Task.from(() => this.agent.onExit()),
|
|
1447
1521
|
name: "AgentActivity_onExit"
|
|
@@ -1460,6 +1534,7 @@ ${instructions}` : instructions,
|
|
|
1460
1534
|
if (!this._draining) {
|
|
1461
1535
|
this.logger.warn("task closing without draining");
|
|
1462
1536
|
}
|
|
1537
|
+
this.cancelPreemptiveGeneration();
|
|
1463
1538
|
if (this.llm instanceof import_llm.LLM) {
|
|
1464
1539
|
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1465
1540
|
}
|