@livekit/agents 1.0.38 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/inference/llm.cjs +7 -3
  2. package/dist/inference/llm.cjs.map +1 -1
  3. package/dist/inference/llm.d.cts +5 -6
  4. package/dist/inference/llm.d.ts +5 -6
  5. package/dist/inference/llm.d.ts.map +1 -1
  6. package/dist/inference/llm.js +7 -3
  7. package/dist/inference/llm.js.map +1 -1
  8. package/dist/inference/stt.cjs.map +1 -1
  9. package/dist/inference/stt.d.cts +5 -4
  10. package/dist/inference/stt.d.ts +5 -4
  11. package/dist/inference/stt.d.ts.map +1 -1
  12. package/dist/inference/stt.js.map +1 -1
  13. package/dist/inference/tts.cjs.map +1 -1
  14. package/dist/inference/tts.d.cts +10 -7
  15. package/dist/inference/tts.d.ts +10 -7
  16. package/dist/inference/tts.d.ts.map +1 -1
  17. package/dist/inference/tts.js.map +1 -1
  18. package/dist/stt/stream_adapter.cjs +9 -1
  19. package/dist/stt/stream_adapter.cjs.map +1 -1
  20. package/dist/stt/stream_adapter.d.ts.map +1 -1
  21. package/dist/stt/stream_adapter.js +9 -1
  22. package/dist/stt/stream_adapter.js.map +1 -1
  23. package/dist/utils.cjs +5 -0
  24. package/dist/utils.cjs.map +1 -1
  25. package/dist/utils.d.cts +8 -0
  26. package/dist/utils.d.ts +8 -0
  27. package/dist/utils.d.ts.map +1 -1
  28. package/dist/utils.js +4 -0
  29. package/dist/utils.js.map +1 -1
  30. package/dist/voice/agent.cjs +1 -2
  31. package/dist/voice/agent.cjs.map +1 -1
  32. package/dist/voice/agent.js +1 -2
  33. package/dist/voice/agent.js.map +1 -1
  34. package/dist/voice/agent_activity.cjs +23 -14
  35. package/dist/voice/agent_activity.cjs.map +1 -1
  36. package/dist/voice/agent_activity.d.cts +1 -0
  37. package/dist/voice/agent_activity.d.ts +1 -0
  38. package/dist/voice/agent_activity.d.ts.map +1 -1
  39. package/dist/voice/agent_activity.js +23 -14
  40. package/dist/voice/agent_activity.js.map +1 -1
  41. package/package.json +2 -2
  42. package/src/inference/llm.ts +20 -15
  43. package/src/inference/stt.ts +9 -7
  44. package/src/inference/tts.ts +36 -16
  45. package/src/stt/stream_adapter.ts +12 -1
  46. package/src/utils.ts +14 -0
  47. package/src/voice/agent.ts +2 -2
  48. package/src/voice/agent_activity.ts +36 -15
@@ -1 +1 @@
- {"version":3,"sources":["../../src/voice/agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2025 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncLocalStorage } from 'node:async_hooks';\nimport { ReadableStream } from 'node:stream/web';\nimport {\n LLM as InferenceLLM,\n STT as InferenceSTT,\n TTS as InferenceTTS,\n type LLMModels,\n type STTModelString,\n type TTSModelString,\n} from '../inference/index.js';\nimport { ReadonlyChatContext } from '../llm/chat_context.js';\nimport type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';\nimport {\n type ChatChunk,\n ChatContext,\n LLM,\n type ToolChoice,\n type ToolContext,\n} from '../llm/index.js';\nimport type { STT, SpeechEvent } from '../stt/index.js';\nimport { StreamAdapter as STTStreamAdapter } from '../stt/index.js';\nimport { SentenceTokenizer as BasicSentenceTokenizer } from '../tokenize/basic/index.js';\nimport type { TTS } from '../tts/index.js';\nimport { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';\nimport type { VAD } from '../vad.js';\nimport type { AgentActivity } from './agent_activity.js';\nimport type { AgentSession, TurnDetectionMode } from './agent_session.js';\n\nexport const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>();\nexport const STOP_RESPONSE_SYMBOL = Symbol('StopResponse');\n\nexport class StopResponse extends Error {\n constructor() {\n super();\n this.name = 'StopResponse';\n\n Object.defineProperty(this, STOP_RESPONSE_SYMBOL, {\n value: true,\n });\n }\n}\n\nexport function isStopResponse(value: unknown): value is StopResponse {\n return (\n value !== undefined &&\n value !== null &&\n typeof value === 'object' &&\n STOP_RESPONSE_SYMBOL in value\n );\n}\n\nexport interface ModelSettings {\n /** The tool choice to use when calling the LLM. */\n toolChoice?: ToolChoice;\n}\n\nexport interface AgentOptions<UserData> {\n id?: string;\n instructions: string;\n chatCtx?: ChatContext;\n tools?: ToolContext<UserData>;\n turnDetection?: TurnDetectionMode;\n stt?: STT | STTModelString;\n vad?: VAD;\n llm?: LLM | RealtimeModel | LLMModels;\n tts?: TTS | TTSModelString;\n allowInterruptions?: boolean;\n minConsecutiveSpeechDelay?: number;\n}\n\nexport class Agent<UserData = any> {\n private _id: string;\n private turnDetection?: TurnDetectionMode;\n private _stt?: STT;\n private _vad?: VAD;\n private _llm?: LLM | RealtimeModel;\n private _tts?: TTS;\n\n /** @internal */\n _agentActivity?: AgentActivity;\n\n /** @internal */\n _chatCtx: ChatContext;\n\n /** @internal */\n _instructions: string;\n\n /** @internal */\n _tools?: ToolContext<UserData>;\n\n constructor({\n id,\n instructions,\n chatCtx,\n tools,\n turnDetection,\n stt,\n vad,\n llm,\n tts,\n }: AgentOptions<UserData>) {\n if (id) {\n this._id = id;\n } else {\n // Convert class name to snake_case\n const className = this.constructor.name;\n if (className === 'Agent') {\n this._id = 'default_agent';\n } else {\n this._id = className\n .replace(/([A-Z])/g, '_$1')\n .toLowerCase()\n .replace(/^_/, '');\n }\n }\n\n this._instructions = instructions;\n this._tools = { ...tools };\n this._chatCtx = chatCtx\n ? 
chatCtx.copy({\n toolCtx: this._tools,\n })\n : ChatContext.empty();\n\n this.turnDetection = turnDetection;\n this._vad = vad;\n\n if (typeof stt === 'string') {\n this._stt = InferenceSTT.fromModelString(stt);\n } else {\n this._stt = stt;\n }\n\n if (typeof llm === 'string') {\n this._llm = InferenceLLM.fromModelString(llm);\n } else {\n this._llm = llm;\n }\n\n if (typeof tts === 'string') {\n this._tts = InferenceTTS.fromModelString(tts);\n } else {\n this._tts = tts;\n }\n\n this._agentActivity = undefined;\n }\n\n get vad(): VAD | undefined {\n return this._vad;\n }\n\n get stt(): STT | undefined {\n return this._stt;\n }\n\n get llm(): LLM | RealtimeModel | undefined {\n return this._llm;\n }\n\n get tts(): TTS | undefined {\n return this._tts;\n }\n\n get chatCtx(): ReadonlyChatContext {\n return new ReadonlyChatContext(this._chatCtx.items);\n }\n\n get id(): string {\n return this._id;\n }\n\n get instructions(): string {\n return this._instructions;\n }\n\n get toolCtx(): ToolContext<UserData> {\n return { ...this._tools };\n }\n\n get session(): AgentSession<UserData> {\n return this.getActivityOrThrow().agentSession as AgentSession<UserData>;\n }\n\n async onEnter(): Promise<void> {}\n\n async onExit(): Promise<void> {}\n\n async transcriptionNode(\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<string> | null> {\n return Agent.default.transcriptionNode(this, text, modelSettings);\n }\n\n async onUserTurnCompleted(_chatCtx: ChatContext, _newMessage: ChatMessage): Promise<void> {}\n\n async sttNode(\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<SpeechEvent | string> | null> {\n return Agent.default.sttNode(this, audio, modelSettings);\n }\n\n async llmNode(\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<ChatChunk | string> | null> {\n return Agent.default.llmNode(this, chatCtx, toolCtx, modelSettings);\n }\n\n async ttsNode(\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return Agent.default.ttsNode(this, text, modelSettings);\n }\n\n async realtimeAudioOutputNode(\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return Agent.default.realtimeAudioOutputNode(this, audio, modelSettings);\n }\n\n // realtime_audio_output_node\n\n getActivityOrThrow(): AgentActivity {\n if (!this._agentActivity) {\n throw new Error('Agent activity not found');\n }\n return this._agentActivity;\n }\n\n async updateChatCtx(chatCtx: ChatContext): Promise<void> {\n if (!this._agentActivity) {\n this._chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });\n return;\n }\n\n this._agentActivity.updateChatCtx(chatCtx);\n }\n\n static default = {\n async sttNode(\n agent: Agent,\n audio: ReadableStream<AudioFrame>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<SpeechEvent | string> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.stt) {\n throw new Error('sttNode called but no STT node is available');\n }\n\n let wrapped_stt = activity.stt;\n\n if (!wrapped_stt.capabilities.streaming) {\n const vad = agent.vad || activity.vad;\n if (!vad) {\n throw new Error(\n 'STT does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming',\n );\n }\n wrapped_stt = new STTStreamAdapter(wrapped_stt, vad);\n }\n\n const connOptions = 
activity.agentSession.connOptions.sttConnOptions;\n const stream = wrapped_stt.stream({ connOptions });\n\n // Set startTimeOffset to provide linear timestamps across reconnections\n const audioInputStartedAt =\n activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available\n activity.agentSession._startedAt ?? // Fallback to session start time\n Date.now(); // Fallback to current time\n\n stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;\n\n stream.updateInputStream(audio);\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.detachInputStream();\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const event of stream) {\n controller.enqueue(event);\n }\n controller.close();\n } finally {\n // Always clean up the STT stream, whether it ends naturally or is cancelled\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async llmNode(\n agent: Agent,\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<ChatChunk | string> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.llm) {\n throw new Error('llmNode called but no LLM node is available');\n }\n\n if (!(activity.llm instanceof LLM)) {\n throw new Error(\n 'llmNode should only be used with LLM (non-multimodal/realtime APIs) nodes',\n );\n }\n\n // TODO(brian): make parallelToolCalls configurable\n const { toolChoice } = modelSettings;\n const connOptions = activity.agentSession.connOptions.llmConnOptions;\n\n const stream = activity.llm.chat({\n chatCtx,\n toolCtx,\n toolChoice,\n connOptions,\n parallelToolCalls: true,\n });\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const chunk of stream) {\n controller.enqueue(chunk);\n }\n controller.close();\n } finally {\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async ttsNode(\n agent: Agent,\n text: ReadableStream<string>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.tts) {\n throw new Error('ttsNode called but no TTS node is available');\n }\n\n let wrapped_tts = activity.tts;\n\n if (!activity.tts.capabilities.streaming) {\n wrapped_tts = new TTSStreamAdapter(wrapped_tts, new BasicSentenceTokenizer());\n }\n\n const connOptions = activity.agentSession.connOptions.ttsConnOptions;\n const stream = wrapped_tts.stream({ connOptions });\n stream.updateInputStream(text);\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const chunk of stream) {\n if (chunk === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n controller.enqueue(chunk.frame);\n }\n controller.close();\n } finally {\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async transcriptionNode(\n agent: Agent,\n text: ReadableStream<string>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<string> | null> {\n return text;\n },\n\n async realtimeAudioOutputNode(\n _agent: Agent,\n audio: ReadableStream<AudioFrame>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return audio;\n },\n 
};\n}\n"],"mappings":"AAIA,SAAS,yBAAyB;AAClC,SAAS,sBAAsB;AAC/B;AAAA,EACE,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AAAA,OAIF;AACP,SAAS,2BAA2B;AAEpC;AAAA,EAEE;AAAA,EACA;AAAA,OAGK;AAEP,SAAS,iBAAiB,wBAAwB;AAClD,SAAS,qBAAqB,8BAA8B;AAE5D,SAAS,kBAAkB,iBAAiB,wBAAwB;AAK7D,MAAM,oBAAoB,IAAI,kBAAmD;AACjF,MAAM,uBAAuB,OAAO,cAAc;AAElD,MAAM,qBAAqB,MAAM;AAAA,EACtC,cAAc;AACZ,UAAM;AACN,SAAK,OAAO;AAEZ,WAAO,eAAe,MAAM,sBAAsB;AAAA,MAChD,OAAO;AAAA,IACT,CAAC;AAAA,EACH;AACF;AAEO,SAAS,eAAe,OAAuC;AACpE,SACE,UAAU,UACV,UAAU,QACV,OAAO,UAAU,YACjB,wBAAwB;AAE5B;AAqBO,MAAM,MAAsB;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGR;AAAA;AAAA,EAGA;AAAA;AAAA,EAGA;AAAA;AAAA,EAGA;AAAA,EAEA,YAAY;AAAA,IACV;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,GAA2B;AACzB,QAAI,IAAI;AACN,WAAK,MAAM;AAAA,IACb,OAAO;AAEL,YAAM,YAAY,KAAK,YAAY;AACnC,UAAI,cAAc,SAAS;AACzB,aAAK,MAAM;AAAA,MACb,OAAO;AACL,aAAK,MAAM,UACR,QAAQ,YAAY,KAAK,EACzB,YAAY,EACZ,QAAQ,MAAM,EAAE;AAAA,MACrB;AAAA,IACF;AAEA,SAAK,gBAAgB;AACrB,SAAK,SAAS,EAAE,GAAG,MAAM;AACzB,SAAK,WAAW,UACZ,QAAQ,KAAK;AAAA,MACX,SAAS,KAAK;AAAA,IAChB,CAAC,IACD,YAAY,MAAM;AAEtB,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAEZ,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,SAAK,iBAAiB;AAAA,EACxB;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuC;AACzC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAA+B;AACjC,WAAO,IAAI,oBAAoB,KAAK,SAAS,KAAK;AAAA,EACpD;AAAA,EAEA,IAAI,KAAa;AACf,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAAiC;AACnC,WAAO,EAAE,GAAG,KAAK,OAAO;AAAA,EAC1B;AAAA,EAEA,IAAI,UAAkC;AACpC,WAAO,KAAK,mBAAmB,EAAE;AAAA,EACnC;AAAA,EAEA,MAAM,UAAyB;AAAA,EAAC;AAAA,EAEhC,MAAM,SAAwB;AAAA,EAAC;AAAA,EAE/B,MAAM,kBACJ,MACA,eACwC;AACxC,WAAO,MAAM,QAAQ,kBAAkB,MAAM,MAAM,aAAa;AAAA,EAClE;AAAA,EAEA,MAAM,oBAAoB,UAAuB,aAAyC;AAAA,EAAC;AAAA,EAE3F,MAAM,QACJ,OACA,eACsD;AACtD,WAAO,MAAM,QAAQ,QAAQ,MAAM,OAAO,aAAa;AAAA,EACzD;AAAA,EAEA,MAAM,QACJ,SACA,SACA,eACoD;AACpD,WAAO,MAAM,QAAQ,QAAQ,MAAM,SAAS,SAAS,aAAa;AAAA,EACpE;AAAA,EAEA,MAAM,QACJ,MACA,eAC4C;AAC5C,WAAO,MAAM,QAAQ,QAAQ,MAAM,MAAM,aAAa;AAAA,EACxD;AAAA,EAEA,MAAM,wBACJ,OACA,eAC4C;AAC5C,WAAO,MAAM,QAAQ,wBAAwB,MAAM,OAAO,aAAa;AAAA,EACzE;AAAA;AAAA,EAIA,qBAAoC;AAClC,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI,MAAM,0BAA0B;AAAA,IAC5C;AACA,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,cAAc,SAAqC;AACvD,QAAI,CAAC,KAAK,gBAAgB;AACxB,WAAK,WAAW,QAAQ,KAAK,EAAE,SAAS,KAAK,QAAQ,CAAC;AACtD;AAAA,IACF;AAEA,SAAK,eAAe,cAAc,OAAO;AAAA,EAC3C;AAAA,EAEA,OAAO,UAAU;AAAA,IACf,MAAM,QACJ,OACA,OACA,gBACsD;AA7P5D;AA8PM,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,cAAc,SAAS;AAE3B,UAAI,CAAC,YAAY,aAAa,WAAW;AACvC,cAAM,MAAM,MAAM,OAAO,SAAS;AAClC,YAAI,CAAC,KAAK;AACR,gBAAM,IAAI;AAAA,YACR;AAAA,UACF;AAAA,QACF;AACA,sBAAc,IAAI,iBAAiB,aAAa,GAAG;AAAA,MACrD;AAEA,YAAM,cAAc,SAAS,aAAa,YAAY;AACtD,YAAM,SAAS,YAAY,OAAO,EAAE,YAAY,CAAC;AAGjD,YAAM,wBACJ,cAAS,aAAa,gBAAtB,mBAAmC;AAAA,MACnC,SAAS,aAAa;AAAA,MACtB,KAAK,IAAI;AAEX,aAAO,mBAAmB,KAAK,IAAI,IAAI,uBAAuB;AAE9D,aAAO,kBAAkB,KAAK;AAE9B,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,kBAAkB;AACzB,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,yBAAW,QAAQ,KAAK;AAAA,YAC1B;A
ACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AAEA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,QACJ,OACA,SACA,SACA,eACoD;AACpD,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,EAAE,SAAS,eAAe,MAAM;AAClC,cAAM,IAAI;AAAA,UACR;AAAA,QACF;AAAA,MACF;AAGA,YAAM,EAAE,WAAW,IAAI;AACvB,YAAM,cAAc,SAAS,aAAa,YAAY;AAEtD,YAAM,SAAS,SAAS,IAAI,KAAK;AAAA,QAC/B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,mBAAmB;AAAA,MACrB,CAAC;AAED,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,yBAAW,QAAQ,KAAK;AAAA,YAC1B;AACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AACA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,QACJ,OACA,MACA,gBAC4C;AAC5C,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,cAAc,SAAS;AAE3B,UAAI,CAAC,SAAS,IAAI,aAAa,WAAW;AACxC,sBAAc,IAAI,iBAAiB,aAAa,IAAI,uBAAuB,CAAC;AAAA,MAC9E;AAEA,YAAM,cAAc,SAAS,aAAa,YAAY;AACtD,YAAM,SAAS,YAAY,OAAO,EAAE,YAAY,CAAC;AACjD,aAAO,kBAAkB,IAAI;AAE7B,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,kBAAI,UAAU,iBAAiB,eAAe;AAC5C;AAAA,cACF;AACA,yBAAW,QAAQ,MAAM,KAAK;AAAA,YAChC;AACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AACA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,kBACJ,OACA,MACA,gBACwC;AACxC,aAAO;AAAA,IACT;AAAA,IAEA,MAAM,wBACJ,QACA,OACA,gBAC4C;AAC5C,aAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}
+ {"version":3,"sources":["../../src/voice/agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2025 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncLocalStorage } from 'node:async_hooks';\nimport { ReadableStream } from 'node:stream/web';\nimport {\n LLM as InferenceLLM,\n STT as InferenceSTT,\n TTS as InferenceTTS,\n type LLMModels,\n type STTModelString,\n type TTSModelString,\n} from '../inference/index.js';\nimport { ReadonlyChatContext } from '../llm/chat_context.js';\nimport type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';\nimport {\n type ChatChunk,\n ChatContext,\n LLM,\n type ToolChoice,\n type ToolContext,\n} from '../llm/index.js';\nimport type { STT, SpeechEvent } from '../stt/index.js';\nimport { StreamAdapter as STTStreamAdapter } from '../stt/index.js';\nimport { SentenceTokenizer as BasicSentenceTokenizer } from '../tokenize/basic/index.js';\nimport type { TTS } from '../tts/index.js';\nimport { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';\nimport type { VAD } from '../vad.js';\nimport type { AgentActivity } from './agent_activity.js';\nimport type { AgentSession, TurnDetectionMode } from './agent_session.js';\n\nexport const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>();\nexport const STOP_RESPONSE_SYMBOL = Symbol('StopResponse');\n\nexport class StopResponse extends Error {\n constructor() {\n super();\n this.name = 'StopResponse';\n\n Object.defineProperty(this, STOP_RESPONSE_SYMBOL, {\n value: true,\n });\n }\n}\n\nexport function isStopResponse(value: unknown): value is StopResponse {\n return (\n value !== undefined &&\n value !== null &&\n typeof value === 'object' &&\n STOP_RESPONSE_SYMBOL in value\n );\n}\n\nexport interface ModelSettings {\n /** The tool choice to use when calling the LLM. */\n toolChoice?: ToolChoice;\n}\n\nexport interface AgentOptions<UserData> {\n id?: string;\n instructions: string;\n chatCtx?: ChatContext;\n tools?: ToolContext<UserData>;\n turnDetection?: TurnDetectionMode;\n stt?: STT | STTModelString;\n vad?: VAD;\n llm?: LLM | RealtimeModel | LLMModels;\n tts?: TTS | TTSModelString;\n allowInterruptions?: boolean;\n minConsecutiveSpeechDelay?: number;\n}\n\nexport class Agent<UserData = any> {\n private _id: string;\n private turnDetection?: TurnDetectionMode;\n private _stt?: STT;\n private _vad?: VAD;\n private _llm?: LLM | RealtimeModel;\n private _tts?: TTS;\n\n /** @internal */\n _agentActivity?: AgentActivity;\n\n /** @internal */\n _chatCtx: ChatContext;\n\n /** @internal */\n _instructions: string;\n\n /** @internal */\n _tools?: ToolContext<UserData>;\n\n constructor({\n id,\n instructions,\n chatCtx,\n tools,\n turnDetection,\n stt,\n vad,\n llm,\n tts,\n }: AgentOptions<UserData>) {\n if (id) {\n this._id = id;\n } else {\n // Convert class name to snake_case\n const className = this.constructor.name;\n if (className === 'Agent') {\n this._id = 'default_agent';\n } else {\n this._id = className\n .replace(/([A-Z])/g, '_$1')\n .toLowerCase()\n .replace(/^_/, '');\n }\n }\n\n this._instructions = instructions;\n this._tools = { ...tools };\n this._chatCtx = chatCtx\n ? 
chatCtx.copy({\n toolCtx: this._tools,\n })\n : ChatContext.empty();\n\n this.turnDetection = turnDetection;\n this._vad = vad;\n\n if (typeof stt === 'string') {\n this._stt = InferenceSTT.fromModelString(stt);\n } else {\n this._stt = stt;\n }\n\n if (typeof llm === 'string') {\n this._llm = InferenceLLM.fromModelString(llm);\n } else {\n this._llm = llm;\n }\n\n if (typeof tts === 'string') {\n this._tts = InferenceTTS.fromModelString(tts);\n } else {\n this._tts = tts;\n }\n\n this._agentActivity = undefined;\n }\n\n get vad(): VAD | undefined {\n return this._vad;\n }\n\n get stt(): STT | undefined {\n return this._stt;\n }\n\n get llm(): LLM | RealtimeModel | undefined {\n return this._llm;\n }\n\n get tts(): TTS | undefined {\n return this._tts;\n }\n\n get chatCtx(): ReadonlyChatContext {\n return new ReadonlyChatContext(this._chatCtx.items);\n }\n\n get id(): string {\n return this._id;\n }\n\n get instructions(): string {\n return this._instructions;\n }\n\n get toolCtx(): ToolContext<UserData> {\n return { ...this._tools };\n }\n\n get session(): AgentSession<UserData> {\n return this.getActivityOrThrow().agentSession as AgentSession<UserData>;\n }\n\n async onEnter(): Promise<void> {}\n\n async onExit(): Promise<void> {}\n\n async transcriptionNode(\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<string> | null> {\n return Agent.default.transcriptionNode(this, text, modelSettings);\n }\n\n async onUserTurnCompleted(_chatCtx: ChatContext, _newMessage: ChatMessage): Promise<void> {}\n\n async sttNode(\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<SpeechEvent | string> | null> {\n return Agent.default.sttNode(this, audio, modelSettings);\n }\n\n async llmNode(\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<ChatChunk | string> | null> {\n return Agent.default.llmNode(this, chatCtx, toolCtx, modelSettings);\n }\n\n async ttsNode(\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return Agent.default.ttsNode(this, text, modelSettings);\n }\n\n async realtimeAudioOutputNode(\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return Agent.default.realtimeAudioOutputNode(this, audio, modelSettings);\n }\n\n // realtime_audio_output_node\n\n getActivityOrThrow(): AgentActivity {\n if (!this._agentActivity) {\n throw new Error('Agent activity not found');\n }\n return this._agentActivity;\n }\n\n async updateChatCtx(chatCtx: ChatContext): Promise<void> {\n if (!this._agentActivity) {\n this._chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });\n return;\n }\n\n this._agentActivity.updateChatCtx(chatCtx);\n }\n\n static default = {\n async sttNode(\n agent: Agent,\n audio: ReadableStream<AudioFrame>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<SpeechEvent | string> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.stt) {\n throw new Error('sttNode called but no STT node is available');\n }\n\n let wrapped_stt = activity.stt;\n\n if (!wrapped_stt.capabilities.streaming) {\n const vad = agent.vad || activity.vad;\n if (!vad) {\n throw new Error(\n 'STT does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming',\n );\n }\n wrapped_stt = new STTStreamAdapter(wrapped_stt, vad);\n }\n\n const connOptions = 
activity.agentSession.connOptions.sttConnOptions;\n const stream = wrapped_stt.stream({ connOptions });\n\n // Set startTimeOffset to provide linear timestamps across reconnections\n const audioInputStartedAt =\n activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available\n activity.agentSession._startedAt ?? // Fallback to session start time\n Date.now(); // Fallback to current time\n\n stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;\n\n stream.updateInputStream(audio);\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.detachInputStream();\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const event of stream) {\n controller.enqueue(event);\n }\n controller.close();\n } finally {\n // Always clean up the STT stream, whether it ends naturally or is cancelled\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async llmNode(\n agent: Agent,\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n ): Promise<ReadableStream<ChatChunk | string> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.llm) {\n throw new Error('llmNode called but no LLM node is available');\n }\n\n if (!(activity.llm instanceof LLM)) {\n throw new Error(\n 'llmNode should only be used with LLM (non-multimodal/realtime APIs) nodes',\n );\n }\n\n const { toolChoice } = modelSettings;\n const connOptions = activity.agentSession.connOptions.llmConnOptions;\n\n // parallelToolCalls is not passed here - it will use the value from LLM's modelOptions\n // This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })\n const stream = activity.llm.chat({\n chatCtx,\n toolCtx,\n toolChoice,\n connOptions,\n });\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const chunk of stream) {\n controller.enqueue(chunk);\n }\n controller.close();\n } finally {\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async ttsNode(\n agent: Agent,\n text: ReadableStream<string>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n const activity = agent.getActivityOrThrow();\n if (!activity.tts) {\n throw new Error('ttsNode called but no TTS node is available');\n }\n\n let wrapped_tts = activity.tts;\n\n if (!activity.tts.capabilities.streaming) {\n wrapped_tts = new TTSStreamAdapter(wrapped_tts, new BasicSentenceTokenizer());\n }\n\n const connOptions = activity.agentSession.connOptions.ttsConnOptions;\n const stream = wrapped_tts.stream({ connOptions });\n stream.updateInputStream(text);\n\n let cleaned = false;\n const cleanup = () => {\n if (cleaned) return;\n cleaned = true;\n stream.close();\n };\n\n return new ReadableStream({\n async start(controller) {\n try {\n for await (const chunk of stream) {\n if (chunk === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n controller.enqueue(chunk.frame);\n }\n controller.close();\n } finally {\n cleanup();\n }\n },\n cancel() {\n cleanup();\n },\n });\n },\n\n async transcriptionNode(\n agent: Agent,\n text: ReadableStream<string>,\n _modelSettings: ModelSettings,\n ): Promise<ReadableStream<string> | null> {\n return text;\n },\n\n async realtimeAudioOutputNode(\n _agent: Agent,\n audio: ReadableStream<AudioFrame>,\n _modelSettings: 
ModelSettings,\n ): Promise<ReadableStream<AudioFrame> | null> {\n return audio;\n },\n };\n}\n"],"mappings":"AAIA,SAAS,yBAAyB;AAClC,SAAS,sBAAsB;AAC/B;AAAA,EACE,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AAAA,OAIF;AACP,SAAS,2BAA2B;AAEpC;AAAA,EAEE;AAAA,EACA;AAAA,OAGK;AAEP,SAAS,iBAAiB,wBAAwB;AAClD,SAAS,qBAAqB,8BAA8B;AAE5D,SAAS,kBAAkB,iBAAiB,wBAAwB;AAK7D,MAAM,oBAAoB,IAAI,kBAAmD;AACjF,MAAM,uBAAuB,OAAO,cAAc;AAElD,MAAM,qBAAqB,MAAM;AAAA,EACtC,cAAc;AACZ,UAAM;AACN,SAAK,OAAO;AAEZ,WAAO,eAAe,MAAM,sBAAsB;AAAA,MAChD,OAAO;AAAA,IACT,CAAC;AAAA,EACH;AACF;AAEO,SAAS,eAAe,OAAuC;AACpE,SACE,UAAU,UACV,UAAU,QACV,OAAO,UAAU,YACjB,wBAAwB;AAE5B;AAqBO,MAAM,MAAsB;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGR;AAAA;AAAA,EAGA;AAAA;AAAA,EAGA;AAAA;AAAA,EAGA;AAAA,EAEA,YAAY;AAAA,IACV;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,GAA2B;AACzB,QAAI,IAAI;AACN,WAAK,MAAM;AAAA,IACb,OAAO;AAEL,YAAM,YAAY,KAAK,YAAY;AACnC,UAAI,cAAc,SAAS;AACzB,aAAK,MAAM;AAAA,MACb,OAAO;AACL,aAAK,MAAM,UACR,QAAQ,YAAY,KAAK,EACzB,YAAY,EACZ,QAAQ,MAAM,EAAE;AAAA,MACrB;AAAA,IACF;AAEA,SAAK,gBAAgB;AACrB,SAAK,SAAS,EAAE,GAAG,MAAM;AACzB,SAAK,WAAW,UACZ,QAAQ,KAAK;AAAA,MACX,SAAS,KAAK;AAAA,IAChB,CAAC,IACD,YAAY,MAAM;AAEtB,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAEZ,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,QAAI,OAAO,QAAQ,UAAU;AAC3B,WAAK,OAAO,aAAa,gBAAgB,GAAG;AAAA,IAC9C,OAAO;AACL,WAAK,OAAO;AAAA,IACd;AAEA,SAAK,iBAAiB;AAAA,EACxB;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuC;AACzC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAA+B;AACjC,WAAO,IAAI,oBAAoB,KAAK,SAAS,KAAK;AAAA,EACpD;AAAA,EAEA,IAAI,KAAa;AACf,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAAiC;AACnC,WAAO,EAAE,GAAG,KAAK,OAAO;AAAA,EAC1B;AAAA,EAEA,IAAI,UAAkC;AACpC,WAAO,KAAK,mBAAmB,EAAE;AAAA,EACnC;AAAA,EAEA,MAAM,UAAyB;AAAA,EAAC;AAAA,EAEhC,MAAM,SAAwB;AAAA,EAAC;AAAA,EAE/B,MAAM,kBACJ,MACA,eACwC;AACxC,WAAO,MAAM,QAAQ,kBAAkB,MAAM,MAAM,aAAa;AAAA,EAClE;AAAA,EAEA,MAAM,oBAAoB,UAAuB,aAAyC;AAAA,EAAC;AAAA,EAE3F,MAAM,QACJ,OACA,eACsD;AACtD,WAAO,MAAM,QAAQ,QAAQ,MAAM,OAAO,aAAa;AAAA,EACzD;AAAA,EAEA,MAAM,QACJ,SACA,SACA,eACoD;AACpD,WAAO,MAAM,QAAQ,QAAQ,MAAM,SAAS,SAAS,aAAa;AAAA,EACpE;AAAA,EAEA,MAAM,QACJ,MACA,eAC4C;AAC5C,WAAO,MAAM,QAAQ,QAAQ,MAAM,MAAM,aAAa;AAAA,EACxD;AAAA,EAEA,MAAM,wBACJ,OACA,eAC4C;AAC5C,WAAO,MAAM,QAAQ,wBAAwB,MAAM,OAAO,aAAa;AAAA,EACzE;AAAA;AAAA,EAIA,qBAAoC;AAClC,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI,MAAM,0BAA0B;AAAA,IAC5C;AACA,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,cAAc,SAAqC;AACvD,QAAI,CAAC,KAAK,gBAAgB;AACxB,WAAK,WAAW,QAAQ,KAAK,EAAE,SAAS,KAAK,QAAQ,CAAC;AACtD;AAAA,IACF;AAEA,SAAK,eAAe,cAAc,OAAO;AAAA,EAC3C;AAAA,EAEA,OAAO,UAAU;AAAA,IACf,MAAM,QACJ,OACA,OACA,gBACsD;AA7P5D;AA8PM,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,cAAc,SAAS;AAE3B,UAAI,CAAC,YAAY,aAAa,WAAW;AACvC,cAAM,MAAM,MAAM,OAAO,SAAS;AAClC,YAAI,CAAC,KAAK;AACR,gBAAM,IAAI;AAAA,YACR;AAAA,UACF;AAAA,QACF;AACA,sBAAc,IAAI,iBAAiB,aAAa,GAAG;AAAA,MACrD;AAEA,YAAM,cAAc,SAAS,aAAa,YAAY;AACtD,YAAM,SAAS,YAAY,OAAO,EAAE,YAAY,CAAC;AAGjD,YAAM,wBACJ,cAAS,aAAa,gBAAtB,mBAAmC;AAAA,MACnC,SAAS,aAAa;AAAA,MACtB,KAAK,IAAI;AAEX,aAAO,mBAAmB,KAAK,IAAI,IAAI,uBAAuB;AAE9D,aAAO,kBAAkB,KAAK;AAE9B,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,kBAAkB;AACzB,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,
QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,yBAAW,QAAQ,KAAK;AAAA,YAC1B;AACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AAEA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,QACJ,OACA,SACA,SACA,eACoD;AACpD,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,EAAE,SAAS,eAAe,MAAM;AAClC,cAAM,IAAI;AAAA,UACR;AAAA,QACF;AAAA,MACF;AAEA,YAAM,EAAE,WAAW,IAAI;AACvB,YAAM,cAAc,SAAS,aAAa,YAAY;AAItD,YAAM,SAAS,SAAS,IAAI,KAAK;AAAA,QAC/B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAED,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,yBAAW,QAAQ,KAAK;AAAA,YAC1B;AACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AACA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,QACJ,OACA,MACA,gBAC4C;AAC5C,YAAM,WAAW,MAAM,mBAAmB;AAC1C,UAAI,CAAC,SAAS,KAAK;AACjB,cAAM,IAAI,MAAM,6CAA6C;AAAA,MAC/D;AAEA,UAAI,cAAc,SAAS;AAE3B,UAAI,CAAC,SAAS,IAAI,aAAa,WAAW;AACxC,sBAAc,IAAI,iBAAiB,aAAa,IAAI,uBAAuB,CAAC;AAAA,MAC9E;AAEA,YAAM,cAAc,SAAS,aAAa,YAAY;AACtD,YAAM,SAAS,YAAY,OAAO,EAAE,YAAY,CAAC;AACjD,aAAO,kBAAkB,IAAI;AAE7B,UAAI,UAAU;AACd,YAAM,UAAU,MAAM;AACpB,YAAI,QAAS;AACb,kBAAU;AACV,eAAO,MAAM;AAAA,MACf;AAEA,aAAO,IAAI,eAAe;AAAA,QACxB,MAAM,MAAM,YAAY;AACtB,cAAI;AACF,6BAAiB,SAAS,QAAQ;AAChC,kBAAI,UAAU,iBAAiB,eAAe;AAC5C;AAAA,cACF;AACA,yBAAW,QAAQ,MAAM,KAAK;AAAA,YAChC;AACA,uBAAW,MAAM;AAAA,UACnB,UAAE;AACA,oBAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,SAAS;AACP,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH;AAAA,IAEA,MAAM,kBACJ,OACA,MACA,gBACwC;AACxC,aAAO;AAAA,IACT;AAAA,IAEA,MAAM,wBACJ,QACA,OACA,gBAC4C;AAC5C,aAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}
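The functional change buried in this source map is in Agent.default.llmNode: the hard-coded parallelToolCalls: true is dropped from the llm.chat() call, and a new comment in the embedded source notes that the value now comes from the LLM's modelOptions. A minimal sketch of that configuration path, following the wording of that comment (the inference export from the package root is an assumption for illustration):

    import { inference } from '@livekit/agents';

    // Sketch: configure parallel tool calls on the LLM itself instead of per chat() call.
    // The option name parallel_tool_calls is taken verbatim from the comment added in llmNode.
    const llm = new inference.LLM({
      modelOptions: { parallel_tool_calls: false },
    });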
@@ -122,9 +122,9 @@ class AgentActivity {
  );
  this.turnDetectionMode = void 0;
  }
- if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
+ if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
  this.logger.warn(
- "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
+ "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
  );
  }
  }
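This narrows the warning to setups where it matters: per the embedded agent.ts, sttNode only needs a VAD when the STT is non-streaming (it wraps the model in the STT StreamAdapter), so streaming STT configurations no longer log it. A hedged sketch of the configuration the warning points at, using the AgentOptions fields from the embedded source (the voice namespace export, the VAD plugin import, and the concrete model strings are assumptions):

    import { voice } from '@livekit/agents';
    // Hypothetical VAD plugin import; any object implementing the VAD interface works here.
    import { VAD } from '@livekit/agents-plugin-silero';

    // Sketch: when the configured STT is non-streaming, a VAD is still required so that
    // sttNode can wrap it in the STT StreamAdapter; with it in place, the warning never fires.
    const agent = new voice.Agent({
      instructions: 'You are a helpful voice assistant.',
      stt: 'provider/some-non-streaming-stt', // hypothetical STTModelString
      vad: await VAD.load(),
      llm: 'openai/gpt-4.1-mini',             // hypothetical LLMModels string
    });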
@@ -473,14 +473,16 @@ class AgentActivity {
  this.agentSession._updateUserState("listening", speechEndTime);
  }
  onVADInferenceDone(ev) {
- var _a, _b;
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
  return;
  }
- if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
- return;
+ if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+ this.interruptByAudioActivity();
  }
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
+ }
+ interruptByAudioActivity() {
+ var _a, _b;
+ if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
  return;
  }
  if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
@@ -493,7 +495,10 @@ class AgentActivity {
  }
  (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
- this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
+ this.logger.info(
+ { "speech id": this._currentSpeech.id },
+ "speech interrupted by audio activity"
+ );
  (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
  this._currentSpeech.interrupt();
  }
@@ -511,6 +516,9 @@ class AgentActivity {
  // TODO(AJS-106): add multi participant support
  })
  );
+ if (ev.alternatives[0].text) {
+ this.interruptByAudioActivity();
+ }
  }
  onFinalTranscript(ev) {
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
@@ -525,6 +533,9 @@ class AgentActivity {
  // TODO(AJS-106): add multi participant support
  })
  );
+ if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
+ this.interruptByAudioActivity();
+ }
  }
  onPreemptiveGeneration(info) {
  if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
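Read together, the hunks above extract the interruption logic out of onVADInferenceDone into a shared interruptByAudioActivity() method and call it from three places: VAD inference (once speechDuration reaches minInterruptionDuration), interim transcripts that contain text, and final transcripts when turn detection is neither "manual" nor "realtime_llm". Reconstructed from the compiled output above, the new method looks roughly like this (the minInterruptionWords branch body is elided exactly as it is in the diff):

    interruptByAudioActivity() {
      // Realtime models with built-in turn detection manage interruptions themselves.
      if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
        return;
      }
      if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
        // ... unchanged minInterruptionWords check (not shown in these hunks)
      }
      this.realtimeSession?.startUserActivity();
      if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
        this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by audio activity');
        this.realtimeSession?.interrupt();
        this._currentSpeech.interrupt();
      }
    }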
@@ -1457,7 +1468,6 @@ ${instructions}` : instructions,
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
  if (audioOutput) {
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
- this.agentSession._updateAgentState("listening");
  }
  if (speechHandle.interrupted) {
  this.logger.debug(
@@ -1525,14 +1535,13 @@ ${instructions}` : instructions,
  this.agentSession._conversationItemAdded(message);
  }
  speechHandle._markGenerationDone();
- toolOutput.firstToolStartedFuture.await.finally(() => {
- this.agentSession._updateAgentState("thinking");
- });
  await executeToolsTask.result;
+ if (toolOutput.output.length > 0) {
+ this.agentSession._updateAgentState("thinking");
+ } else if (this.agentSession.agentState === "speaking") {
+ this.agentSession._updateAgentState("listening");
+ }
  if (toolOutput.output.length === 0) {
- if (!speechHandle.interrupted) {
- this.agentSession._updateAgentState("listening");
- }
  return;
  }
  const { maxToolSteps } = this.agentSession.options;
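The last two hunks rework when the agent state is updated around tool execution: the unconditional switch to "listening" after audio playout and the firstToolStartedFuture-based switch to "thinking" are both removed, and the state is settled once executeToolsTask has finished. In source form the new tail of that method reads roughly as follows (reconstructed from the compiled diff above, with interpretive comments):

    await executeToolsTask.result;
    if (toolOutput.output.length > 0) {
      // Tools produced output, so a follow-up generation will run: report "thinking".
      this.agentSession._updateAgentState('thinking');
    } else if (this.agentSession.agentState === 'speaking') {
      // No tool output and playout is over: drop back to "listening".
      this.agentSession._updateAgentState('listening');
    }
    if (toolOutput.output.length === 0) {
      return;
    }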