@livekit/agents 0.7.9 → 1.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (627)
  1. package/dist/_exceptions.cjs +109 -0
  2. package/dist/_exceptions.cjs.map +1 -0
  3. package/dist/_exceptions.d.cts +64 -0
  4. package/dist/_exceptions.d.ts +64 -0
  5. package/dist/_exceptions.d.ts.map +1 -0
  6. package/dist/_exceptions.js +80 -0
  7. package/dist/_exceptions.js.map +1 -0
  8. package/dist/audio.cjs +10 -3
  9. package/dist/audio.cjs.map +1 -1
  10. package/dist/audio.d.cts +2 -0
  11. package/dist/audio.d.ts +2 -0
  12. package/dist/audio.d.ts.map +1 -1
  13. package/dist/audio.js +8 -2
  14. package/dist/audio.js.map +1 -1
  15. package/dist/cli.cjs +25 -0
  16. package/dist/cli.cjs.map +1 -1
  17. package/dist/cli.d.ts.map +1 -1
  18. package/dist/cli.js +25 -0
  19. package/dist/cli.js.map +1 -1
  20. package/dist/constants.cjs +6 -3
  21. package/dist/constants.cjs.map +1 -1
  22. package/dist/constants.d.cts +2 -1
  23. package/dist/constants.d.ts +2 -1
  24. package/dist/constants.d.ts.map +1 -1
  25. package/dist/constants.js +4 -2
  26. package/dist/constants.js.map +1 -1
  27. package/dist/http_server.cjs.map +1 -1
  28. package/dist/http_server.d.cts +1 -0
  29. package/dist/http_server.d.ts +1 -0
  30. package/dist/http_server.d.ts.map +1 -1
  31. package/dist/http_server.js.map +1 -1
  32. package/dist/index.cjs +27 -20
  33. package/dist/index.cjs.map +1 -1
  34. package/dist/index.d.cts +13 -10
  35. package/dist/index.d.ts +13 -10
  36. package/dist/index.d.ts.map +1 -1
  37. package/dist/index.js +15 -11
  38. package/dist/index.js.map +1 -1
  39. package/dist/inference_runner.cjs +0 -1
  40. package/dist/inference_runner.cjs.map +1 -1
  41. package/dist/inference_runner.d.cts +2 -3
  42. package/dist/inference_runner.d.ts +2 -3
  43. package/dist/inference_runner.d.ts.map +1 -1
  44. package/dist/inference_runner.js +0 -1
  45. package/dist/inference_runner.js.map +1 -1
  46. package/dist/ipc/inference_proc_executor.cjs +2 -2
  47. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  48. package/dist/ipc/inference_proc_executor.js +2 -2
  49. package/dist/ipc/inference_proc_executor.js.map +1 -1
  50. package/dist/ipc/job_executor.cjs.map +1 -1
  51. package/dist/ipc/job_executor.js.map +1 -1
  52. package/dist/ipc/job_proc_executor.cjs +1 -0
  53. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  54. package/dist/ipc/job_proc_executor.js +1 -0
  55. package/dist/ipc/job_proc_executor.js.map +1 -1
  56. package/dist/ipc/job_proc_lazy_main.cjs +1 -1
  57. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  58. package/dist/ipc/job_proc_lazy_main.js +1 -1
  59. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  60. package/dist/ipc/supervised_proc.d.cts +1 -1
  61. package/dist/ipc/supervised_proc.d.ts +1 -1
  62. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  63. package/dist/job.cjs +14 -2
  64. package/dist/job.cjs.map +1 -1
  65. package/dist/job.d.cts +8 -0
  66. package/dist/job.d.ts +8 -0
  67. package/dist/job.d.ts.map +1 -1
  68. package/dist/job.js +12 -1
  69. package/dist/job.js.map +1 -1
  70. package/dist/llm/chat_context.cjs +332 -82
  71. package/dist/llm/chat_context.cjs.map +1 -1
  72. package/dist/llm/chat_context.d.cts +152 -48
  73. package/dist/llm/chat_context.d.ts +152 -48
  74. package/dist/llm/chat_context.d.ts.map +1 -1
  75. package/dist/llm/chat_context.js +327 -81
  76. package/dist/llm/chat_context.js.map +1 -1
  77. package/dist/llm/chat_context.test.cjs +380 -0
  78. package/dist/llm/chat_context.test.cjs.map +1 -0
  79. package/dist/llm/chat_context.test.js +385 -0
  80. package/dist/llm/chat_context.test.js.map +1 -0
  81. package/dist/llm/index.cjs +37 -8
  82. package/dist/llm/index.cjs.map +1 -1
  83. package/dist/llm/index.d.cts +7 -3
  84. package/dist/llm/index.d.ts +7 -3
  85. package/dist/llm/index.d.ts.map +1 -1
  86. package/dist/llm/index.js +39 -9
  87. package/dist/llm/index.js.map +1 -1
  88. package/dist/llm/llm.cjs +97 -33
  89. package/dist/llm/llm.cjs.map +1 -1
  90. package/dist/llm/llm.d.cts +50 -24
  91. package/dist/llm/llm.d.ts +50 -24
  92. package/dist/llm/llm.d.ts.map +1 -1
  93. package/dist/llm/llm.js +98 -33
  94. package/dist/llm/llm.js.map +1 -1
  95. package/dist/llm/provider_format/google.cjs +128 -0
  96. package/dist/llm/provider_format/google.cjs.map +1 -0
  97. package/dist/llm/provider_format/google.d.cts +6 -0
  98. package/dist/llm/provider_format/google.d.ts +6 -0
  99. package/dist/llm/provider_format/google.d.ts.map +1 -0
  100. package/dist/llm/provider_format/google.js +104 -0
  101. package/dist/llm/provider_format/google.js.map +1 -0
  102. package/dist/llm/provider_format/google.test.cjs +676 -0
  103. package/dist/llm/provider_format/google.test.cjs.map +1 -0
  104. package/dist/llm/provider_format/google.test.js +675 -0
  105. package/dist/llm/provider_format/google.test.js.map +1 -0
  106. package/dist/llm/provider_format/index.cjs +40 -0
  107. package/dist/llm/provider_format/index.cjs.map +1 -0
  108. package/dist/llm/provider_format/index.d.cts +4 -0
  109. package/dist/llm/provider_format/index.d.ts +4 -0
  110. package/dist/llm/provider_format/index.d.ts.map +1 -0
  111. package/dist/llm/provider_format/index.js +16 -0
  112. package/dist/llm/provider_format/index.js.map +1 -0
  113. package/dist/llm/provider_format/openai.cjs +116 -0
  114. package/dist/llm/provider_format/openai.cjs.map +1 -0
  115. package/dist/llm/provider_format/openai.d.cts +3 -0
  116. package/dist/llm/provider_format/openai.d.ts +3 -0
  117. package/dist/llm/provider_format/openai.d.ts.map +1 -0
  118. package/dist/llm/provider_format/openai.js +92 -0
  119. package/dist/llm/provider_format/openai.js.map +1 -0
  120. package/dist/llm/provider_format/openai.test.cjs +490 -0
  121. package/dist/llm/provider_format/openai.test.cjs.map +1 -0
  122. package/dist/llm/provider_format/openai.test.js +489 -0
  123. package/dist/llm/provider_format/openai.test.js.map +1 -0
  124. package/dist/llm/provider_format/utils.cjs +146 -0
  125. package/dist/llm/provider_format/utils.cjs.map +1 -0
  126. package/dist/llm/provider_format/utils.d.cts +38 -0
  127. package/dist/llm/provider_format/utils.d.ts +38 -0
  128. package/dist/llm/provider_format/utils.d.ts.map +1 -0
  129. package/dist/llm/provider_format/utils.js +122 -0
  130. package/dist/llm/provider_format/utils.js.map +1 -0
  131. package/dist/llm/realtime.cjs +77 -0
  132. package/dist/llm/realtime.cjs.map +1 -0
  133. package/dist/llm/realtime.d.cts +98 -0
  134. package/dist/llm/realtime.d.ts +98 -0
  135. package/dist/llm/realtime.d.ts.map +1 -0
  136. package/dist/llm/realtime.js +52 -0
  137. package/dist/llm/realtime.js.map +1 -0
  138. package/dist/llm/remote_chat_context.cjs +112 -0
  139. package/dist/llm/remote_chat_context.cjs.map +1 -0
  140. package/dist/llm/remote_chat_context.d.cts +23 -0
  141. package/dist/llm/remote_chat_context.d.ts +23 -0
  142. package/dist/llm/remote_chat_context.d.ts.map +1 -0
  143. package/dist/llm/remote_chat_context.js +88 -0
  144. package/dist/llm/remote_chat_context.js.map +1 -0
  145. package/dist/llm/remote_chat_context.test.cjs +225 -0
  146. package/dist/llm/remote_chat_context.test.cjs.map +1 -0
  147. package/dist/llm/remote_chat_context.test.js +224 -0
  148. package/dist/llm/remote_chat_context.test.js.map +1 -0
  149. package/dist/llm/tool_context.cjs +111 -0
  150. package/dist/llm/tool_context.cjs.map +1 -0
  151. package/dist/llm/tool_context.d.cts +125 -0
  152. package/dist/llm/tool_context.d.ts +125 -0
  153. package/dist/llm/tool_context.d.ts.map +1 -0
  154. package/dist/llm/tool_context.js +80 -0
  155. package/dist/llm/tool_context.js.map +1 -0
  156. package/dist/llm/tool_context.test.cjs +162 -0
  157. package/dist/llm/tool_context.test.cjs.map +1 -0
  158. package/dist/llm/tool_context.test.js +161 -0
  159. package/dist/llm/tool_context.test.js.map +1 -0
  160. package/dist/llm/tool_context.type.test.cjs +92 -0
  161. package/dist/llm/tool_context.type.test.cjs.map +1 -0
  162. package/dist/llm/tool_context.type.test.js +91 -0
  163. package/dist/llm/tool_context.type.test.js.map +1 -0
  164. package/dist/llm/utils.cjs +260 -0
  165. package/dist/llm/utils.cjs.map +1 -0
  166. package/dist/llm/utils.d.cts +42 -0
  167. package/dist/llm/utils.d.ts +42 -0
  168. package/dist/llm/utils.d.ts.map +1 -0
  169. package/dist/llm/utils.js +223 -0
  170. package/dist/llm/utils.js.map +1 -0
  171. package/dist/llm/utils.test.cjs +513 -0
  172. package/dist/llm/utils.test.cjs.map +1 -0
  173. package/dist/llm/utils.test.js +490 -0
  174. package/dist/llm/utils.test.js.map +1 -0
  175. package/dist/metrics/base.cjs +0 -27
  176. package/dist/metrics/base.cjs.map +1 -1
  177. package/dist/metrics/base.d.cts +105 -63
  178. package/dist/metrics/base.d.ts +105 -63
  179. package/dist/metrics/base.d.ts.map +1 -1
  180. package/dist/metrics/base.js +0 -19
  181. package/dist/metrics/base.js.map +1 -1
  182. package/dist/metrics/index.cjs +0 -3
  183. package/dist/metrics/index.cjs.map +1 -1
  184. package/dist/metrics/index.d.cts +2 -3
  185. package/dist/metrics/index.d.ts +2 -3
  186. package/dist/metrics/index.d.ts.map +1 -1
  187. package/dist/metrics/index.js +0 -2
  188. package/dist/metrics/index.js.map +1 -1
  189. package/dist/metrics/usage_collector.cjs +17 -12
  190. package/dist/metrics/usage_collector.cjs.map +1 -1
  191. package/dist/metrics/usage_collector.d.cts +3 -2
  192. package/dist/metrics/usage_collector.d.ts +3 -2
  193. package/dist/metrics/usage_collector.d.ts.map +1 -1
  194. package/dist/metrics/usage_collector.js +17 -12
  195. package/dist/metrics/usage_collector.js.map +1 -1
  196. package/dist/metrics/utils.cjs +22 -59
  197. package/dist/metrics/utils.cjs.map +1 -1
  198. package/dist/metrics/utils.d.cts +1 -8
  199. package/dist/metrics/utils.d.ts +1 -8
  200. package/dist/metrics/utils.d.ts.map +1 -1
  201. package/dist/metrics/utils.js +22 -52
  202. package/dist/metrics/utils.js.map +1 -1
  203. package/dist/multimodal/index.cjs +0 -2
  204. package/dist/multimodal/index.cjs.map +1 -1
  205. package/dist/multimodal/index.d.cts +0 -1
  206. package/dist/multimodal/index.d.ts +0 -1
  207. package/dist/multimodal/index.d.ts.map +1 -1
  208. package/dist/multimodal/index.js +0 -1
  209. package/dist/multimodal/index.js.map +1 -1
  210. package/dist/plugin.cjs +24 -8
  211. package/dist/plugin.cjs.map +1 -1
  212. package/dist/plugin.d.cts +18 -4
  213. package/dist/plugin.d.ts +18 -4
  214. package/dist/plugin.d.ts.map +1 -1
  215. package/dist/plugin.js +22 -7
  216. package/dist/plugin.js.map +1 -1
  217. package/dist/stream/deferred_stream.cjs +98 -0
  218. package/dist/stream/deferred_stream.cjs.map +1 -0
  219. package/dist/stream/deferred_stream.d.cts +27 -0
  220. package/dist/stream/deferred_stream.d.ts +27 -0
  221. package/dist/stream/deferred_stream.d.ts.map +1 -0
  222. package/dist/stream/deferred_stream.js +73 -0
  223. package/dist/stream/deferred_stream.js.map +1 -0
  224. package/dist/stream/deferred_stream.test.cjs +527 -0
  225. package/dist/stream/deferred_stream.test.cjs.map +1 -0
  226. package/dist/stream/deferred_stream.test.js +526 -0
  227. package/dist/stream/deferred_stream.test.js.map +1 -0
  228. package/dist/stream/identity_transform.cjs +42 -0
  229. package/dist/stream/identity_transform.cjs.map +1 -0
  230. package/dist/stream/identity_transform.d.cts +6 -0
  231. package/dist/stream/identity_transform.d.ts +6 -0
  232. package/dist/stream/identity_transform.d.ts.map +1 -0
  233. package/dist/stream/identity_transform.js +18 -0
  234. package/dist/stream/identity_transform.js.map +1 -0
  235. package/dist/stream/identity_transform.test.cjs +125 -0
  236. package/dist/stream/identity_transform.test.cjs.map +1 -0
  237. package/dist/stream/identity_transform.test.js +124 -0
  238. package/dist/stream/identity_transform.test.js.map +1 -0
  239. package/dist/stream/index.cjs +38 -0
  240. package/dist/stream/index.cjs.map +1 -0
  241. package/dist/stream/index.d.cts +5 -0
  242. package/dist/stream/index.d.ts +5 -0
  243. package/dist/stream/index.d.ts.map +1 -0
  244. package/dist/stream/index.js +11 -0
  245. package/dist/stream/index.js.map +1 -0
  246. package/dist/stream/merge_readable_streams.cjs +59 -0
  247. package/dist/stream/merge_readable_streams.cjs.map +1 -0
  248. package/dist/stream/merge_readable_streams.d.cts +4 -0
  249. package/dist/stream/merge_readable_streams.d.ts +4 -0
  250. package/dist/stream/merge_readable_streams.d.ts.map +1 -0
  251. package/dist/stream/merge_readable_streams.js +35 -0
  252. package/dist/stream/merge_readable_streams.js.map +1 -0
  253. package/dist/stream/stream_channel.cjs +47 -0
  254. package/dist/stream/stream_channel.cjs.map +1 -0
  255. package/dist/stream/stream_channel.d.cts +9 -0
  256. package/dist/stream/stream_channel.d.ts +9 -0
  257. package/dist/stream/stream_channel.d.ts.map +1 -0
  258. package/dist/stream/stream_channel.js +23 -0
  259. package/dist/stream/stream_channel.js.map +1 -0
  260. package/dist/stream/stream_channel.test.cjs +97 -0
  261. package/dist/stream/stream_channel.test.cjs.map +1 -0
  262. package/dist/stream/stream_channel.test.js +96 -0
  263. package/dist/stream/stream_channel.test.js.map +1 -0
  264. package/dist/stt/stream_adapter.cjs +3 -4
  265. package/dist/stt/stream_adapter.cjs.map +1 -1
  266. package/dist/stt/stream_adapter.d.cts +1 -0
  267. package/dist/stt/stream_adapter.d.ts +1 -0
  268. package/dist/stt/stream_adapter.d.ts.map +1 -1
  269. package/dist/stt/stream_adapter.js +3 -4
  270. package/dist/stt/stream_adapter.js.map +1 -1
  271. package/dist/stt/stt.cjs +100 -10
  272. package/dist/stt/stt.cjs.map +1 -1
  273. package/dist/stt/stt.d.cts +26 -5
  274. package/dist/stt/stt.d.ts +26 -5
  275. package/dist/stt/stt.d.ts.map +1 -1
  276. package/dist/stt/stt.js +101 -11
  277. package/dist/stt/stt.js.map +1 -1
  278. package/dist/tokenize/basic/basic.cjs +10 -5
  279. package/dist/tokenize/basic/basic.cjs.map +1 -1
  280. package/dist/tokenize/basic/basic.d.cts +7 -1
  281. package/dist/tokenize/basic/basic.d.ts +7 -1
  282. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  283. package/dist/tokenize/basic/basic.js +10 -5
  284. package/dist/tokenize/basic/basic.js.map +1 -1
  285. package/dist/tokenize/basic/sentence.cjs +14 -6
  286. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  287. package/dist/tokenize/basic/sentence.d.cts +1 -1
  288. package/dist/tokenize/basic/sentence.d.ts +1 -1
  289. package/dist/tokenize/basic/sentence.d.ts.map +1 -1
  290. package/dist/tokenize/basic/sentence.js +14 -6
  291. package/dist/tokenize/basic/sentence.js.map +1 -1
  292. package/dist/tokenize/token_stream.cjs +5 -3
  293. package/dist/tokenize/token_stream.cjs.map +1 -1
  294. package/dist/tokenize/token_stream.d.cts +1 -0
  295. package/dist/tokenize/token_stream.d.ts +1 -0
  296. package/dist/tokenize/token_stream.d.ts.map +1 -1
  297. package/dist/tokenize/token_stream.js +6 -4
  298. package/dist/tokenize/token_stream.js.map +1 -1
  299. package/dist/transcription.cjs +1 -2
  300. package/dist/transcription.cjs.map +1 -1
  301. package/dist/transcription.d.ts.map +1 -1
  302. package/dist/transcription.js +2 -3
  303. package/dist/transcription.js.map +1 -1
  304. package/dist/tts/index.cjs +2 -4
  305. package/dist/tts/index.cjs.map +1 -1
  306. package/dist/tts/index.d.cts +1 -1
  307. package/dist/tts/index.d.ts +1 -1
  308. package/dist/tts/index.d.ts.map +1 -1
  309. package/dist/tts/index.js +1 -3
  310. package/dist/tts/index.js.map +1 -1
  311. package/dist/tts/stream_adapter.cjs +26 -13
  312. package/dist/tts/stream_adapter.cjs.map +1 -1
  313. package/dist/tts/stream_adapter.d.cts +1 -1
  314. package/dist/tts/stream_adapter.d.ts +1 -1
  315. package/dist/tts/stream_adapter.d.ts.map +1 -1
  316. package/dist/tts/stream_adapter.js +27 -14
  317. package/dist/tts/stream_adapter.js.map +1 -1
  318. package/dist/tts/tts.cjs +156 -25
  319. package/dist/tts/tts.cjs.map +1 -1
  320. package/dist/tts/tts.d.cts +29 -5
  321. package/dist/tts/tts.d.ts +29 -5
  322. package/dist/tts/tts.d.ts.map +1 -1
  323. package/dist/tts/tts.js +156 -24
  324. package/dist/tts/tts.js.map +1 -1
  325. package/dist/types.cjs +60 -0
  326. package/dist/types.cjs.map +1 -0
  327. package/dist/types.d.cts +13 -0
  328. package/dist/types.d.ts +13 -0
  329. package/dist/types.d.ts.map +1 -0
  330. package/dist/types.js +35 -0
  331. package/dist/types.js.map +1 -0
  332. package/dist/utils.cjs +298 -27
  333. package/dist/utils.cjs.map +1 -1
  334. package/dist/utils.d.cts +145 -9
  335. package/dist/utils.d.ts +145 -9
  336. package/dist/utils.d.ts.map +1 -1
  337. package/dist/utils.js +281 -26
  338. package/dist/utils.js.map +1 -1
  339. package/dist/utils.test.cjs +491 -0
  340. package/dist/utils.test.cjs.map +1 -0
  341. package/dist/utils.test.js +498 -0
  342. package/dist/utils.test.js.map +1 -0
  343. package/dist/vad.cjs +76 -20
  344. package/dist/vad.cjs.map +1 -1
  345. package/dist/vad.d.cts +25 -5
  346. package/dist/vad.d.ts +25 -5
  347. package/dist/vad.d.ts.map +1 -1
  348. package/dist/vad.js +76 -20
  349. package/dist/vad.js.map +1 -1
  350. package/dist/voice/agent.cjs +245 -0
  351. package/dist/voice/agent.cjs.map +1 -0
  352. package/dist/voice/agent.d.cts +78 -0
  353. package/dist/voice/agent.d.ts +78 -0
  354. package/dist/voice/agent.d.ts.map +1 -0
  355. package/dist/voice/agent.js +220 -0
  356. package/dist/voice/agent.js.map +1 -0
  357. package/dist/voice/agent.test.cjs +61 -0
  358. package/dist/voice/agent.test.cjs.map +1 -0
  359. package/dist/voice/agent.test.js +60 -0
  360. package/dist/voice/agent.test.js.map +1 -0
  361. package/dist/voice/agent_activity.cjs +1453 -0
  362. package/dist/voice/agent_activity.cjs.map +1 -0
  363. package/dist/voice/agent_activity.d.cts +94 -0
  364. package/dist/voice/agent_activity.d.ts +94 -0
  365. package/dist/voice/agent_activity.d.ts.map +1 -0
  366. package/dist/voice/agent_activity.js +1449 -0
  367. package/dist/voice/agent_activity.js.map +1 -0
  368. package/dist/voice/agent_session.cjs +312 -0
  369. package/dist/voice/agent_session.cjs.map +1 -0
  370. package/dist/voice/agent_session.d.cts +121 -0
  371. package/dist/voice/agent_session.d.ts +121 -0
  372. package/dist/voice/agent_session.d.ts.map +1 -0
  373. package/dist/voice/agent_session.js +295 -0
  374. package/dist/voice/agent_session.js.map +1 -0
  375. package/dist/voice/audio_recognition.cjs +374 -0
  376. package/dist/voice/audio_recognition.cjs.map +1 -0
  377. package/dist/voice/audio_recognition.d.cts +80 -0
  378. package/dist/voice/audio_recognition.d.ts +80 -0
  379. package/dist/voice/audio_recognition.d.ts.map +1 -0
  380. package/dist/voice/audio_recognition.js +350 -0
  381. package/dist/voice/audio_recognition.js.map +1 -0
  382. package/dist/voice/events.cjs +145 -0
  383. package/dist/voice/events.cjs.map +1 -0
  384. package/dist/voice/events.d.cts +124 -0
  385. package/dist/voice/events.d.ts +124 -0
  386. package/dist/voice/events.d.ts.map +1 -0
  387. package/dist/voice/events.js +110 -0
  388. package/dist/voice/events.js.map +1 -0
  389. package/dist/voice/generation.cjs +700 -0
  390. package/dist/voice/generation.cjs.map +1 -0
  391. package/dist/voice/generation.d.cts +115 -0
  392. package/dist/voice/generation.d.ts +115 -0
  393. package/dist/voice/generation.d.ts.map +1 -0
  394. package/dist/voice/generation.js +672 -0
  395. package/dist/voice/generation.js.map +1 -0
  396. package/dist/voice/index.cjs +40 -0
  397. package/dist/voice/index.cjs.map +1 -0
  398. package/dist/voice/index.d.cts +5 -0
  399. package/dist/voice/index.d.ts +5 -0
  400. package/dist/voice/index.d.ts.map +1 -0
  401. package/dist/voice/index.js +11 -0
  402. package/dist/voice/index.js.map +1 -0
  403. package/dist/voice/io.cjs +245 -0
  404. package/dist/voice/io.cjs.map +1 -0
  405. package/dist/voice/io.d.cts +101 -0
  406. package/dist/voice/io.d.ts +101 -0
  407. package/dist/voice/io.d.ts.map +1 -0
  408. package/dist/voice/io.js +217 -0
  409. package/dist/voice/io.js.map +1 -0
  410. package/dist/voice/room_io/_input.cjs +121 -0
  411. package/dist/voice/room_io/_input.cjs.map +1 -0
  412. package/dist/voice/room_io/_input.d.cts +24 -0
  413. package/dist/voice/room_io/_input.d.ts +24 -0
  414. package/dist/voice/room_io/_input.d.ts.map +1 -0
  415. package/dist/voice/room_io/_input.js +102 -0
  416. package/dist/voice/room_io/_input.js.map +1 -0
  417. package/dist/voice/room_io/_output.cjs +358 -0
  418. package/dist/voice/room_io/_output.cjs.map +1 -0
  419. package/dist/voice/room_io/_output.d.cts +75 -0
  420. package/dist/voice/room_io/_output.d.ts +75 -0
  421. package/dist/voice/room_io/_output.d.ts.map +1 -0
  422. package/dist/voice/room_io/_output.js +342 -0
  423. package/dist/voice/room_io/_output.js.map +1 -0
  424. package/dist/voice/room_io/index.cjs +25 -0
  425. package/dist/voice/room_io/index.cjs.map +1 -0
  426. package/dist/voice/room_io/index.d.cts +3 -0
  427. package/dist/voice/room_io/index.d.ts +3 -0
  428. package/dist/voice/room_io/index.d.ts.map +1 -0
  429. package/dist/voice/room_io/index.js +3 -0
  430. package/dist/voice/room_io/index.js.map +1 -0
  431. package/dist/voice/room_io/room_io.cjs +370 -0
  432. package/dist/voice/room_io/room_io.cjs.map +1 -0
  433. package/dist/voice/room_io/room_io.d.cts +73 -0
  434. package/dist/voice/room_io/room_io.d.ts +73 -0
  435. package/dist/voice/room_io/room_io.d.ts.map +1 -0
  436. package/dist/voice/room_io/room_io.js +361 -0
  437. package/dist/voice/room_io/room_io.js.map +1 -0
  438. package/dist/{pipeline/index.cjs → voice/run_context.cjs} +16 -11
  439. package/dist/voice/run_context.cjs.map +1 -0
  440. package/dist/voice/run_context.d.cts +12 -0
  441. package/dist/voice/run_context.d.ts +12 -0
  442. package/dist/voice/run_context.d.ts.map +1 -0
  443. package/dist/voice/run_context.js +14 -0
  444. package/dist/voice/run_context.js.map +1 -0
  445. package/dist/voice/speech_handle.cjs +105 -0
  446. package/dist/voice/speech_handle.cjs.map +1 -0
  447. package/dist/voice/speech_handle.d.cts +46 -0
  448. package/dist/voice/speech_handle.d.ts +46 -0
  449. package/dist/voice/speech_handle.d.ts.map +1 -0
  450. package/dist/voice/speech_handle.js +81 -0
  451. package/dist/voice/speech_handle.js.map +1 -0
  452. package/dist/voice/transcription/_utils.cjs +45 -0
  453. package/dist/voice/transcription/_utils.cjs.map +1 -0
  454. package/dist/voice/transcription/_utils.d.cts +3 -0
  455. package/dist/voice/transcription/_utils.d.ts +3 -0
  456. package/dist/voice/transcription/_utils.d.ts.map +1 -0
  457. package/dist/voice/transcription/_utils.js +21 -0
  458. package/dist/voice/transcription/_utils.js.map +1 -0
  459. package/dist/voice/transcription/index.cjs +23 -0
  460. package/dist/voice/transcription/index.cjs.map +1 -0
  461. package/dist/voice/transcription/index.d.cts +2 -0
  462. package/dist/voice/transcription/index.d.ts +2 -0
  463. package/dist/voice/transcription/index.d.ts.map +1 -0
  464. package/dist/voice/transcription/index.js +2 -0
  465. package/dist/voice/transcription/index.js.map +1 -0
  466. package/dist/voice/transcription/synchronizer.cjs +379 -0
  467. package/dist/voice/transcription/synchronizer.cjs.map +1 -0
  468. package/dist/voice/transcription/synchronizer.d.cts +86 -0
  469. package/dist/voice/transcription/synchronizer.d.ts +86 -0
  470. package/dist/voice/transcription/synchronizer.d.ts.map +1 -0
  471. package/dist/voice/transcription/synchronizer.js +354 -0
  472. package/dist/voice/transcription/synchronizer.js.map +1 -0
  473. package/dist/worker.cjs +22 -4
  474. package/dist/worker.cjs.map +1 -1
  475. package/dist/worker.d.cts +1 -1
  476. package/dist/worker.d.ts +1 -1
  477. package/dist/worker.d.ts.map +1 -1
  478. package/dist/worker.js +22 -4
  479. package/dist/worker.js.map +1 -1
  480. package/package.json +8 -2
  481. package/src/_exceptions.ts +137 -0
  482. package/src/audio.ts +12 -1
  483. package/src/cli.ts +37 -0
  484. package/src/constants.ts +2 -1
  485. package/src/http_server.ts +1 -0
  486. package/src/index.ts +13 -10
  487. package/src/inference_runner.ts +2 -3
  488. package/src/ipc/inference_proc_executor.ts +2 -2
  489. package/src/ipc/job_executor.ts +1 -1
  490. package/src/ipc/job_proc_executor.ts +1 -1
  491. package/src/ipc/job_proc_lazy_main.ts +1 -1
  492. package/src/job.ts +18 -0
  493. package/src/llm/__snapshots__/chat_context.test.ts.snap +527 -0
  494. package/src/llm/__snapshots__/tool_context.test.ts.snap +177 -0
  495. package/src/llm/__snapshots__/utils.test.ts.snap +65 -0
  496. package/src/llm/chat_context.test.ts +450 -0
  497. package/src/llm/chat_context.ts +501 -103
  498. package/src/llm/index.ts +53 -18
  499. package/src/llm/llm.ts +148 -50
  500. package/src/llm/provider_format/google.test.ts +772 -0
  501. package/src/llm/provider_format/google.ts +130 -0
  502. package/src/llm/provider_format/index.ts +23 -0
  503. package/src/llm/provider_format/openai.test.ts +581 -0
  504. package/src/llm/provider_format/openai.ts +118 -0
  505. package/src/llm/provider_format/utils.ts +183 -0
  506. package/src/llm/realtime.ts +151 -0
  507. package/src/llm/remote_chat_context.test.ts +290 -0
  508. package/src/llm/remote_chat_context.ts +114 -0
  509. package/src/llm/tool_context.test.ts +198 -0
  510. package/src/llm/tool_context.ts +259 -0
  511. package/src/llm/tool_context.type.test.ts +115 -0
  512. package/src/llm/utils.test.ts +670 -0
  513. package/src/llm/utils.ts +324 -0
  514. package/src/metrics/base.ts +110 -78
  515. package/src/metrics/index.ts +3 -9
  516. package/src/metrics/usage_collector.ts +19 -13
  517. package/src/metrics/utils.ts +24 -69
  518. package/src/multimodal/index.ts +0 -1
  519. package/src/plugin.ts +26 -8
  520. package/src/stream/deferred_stream.test.ts +755 -0
  521. package/src/stream/deferred_stream.ts +110 -0
  522. package/src/stream/identity_transform.test.ts +179 -0
  523. package/src/stream/identity_transform.ts +18 -0
  524. package/src/stream/index.ts +7 -0
  525. package/src/stream/merge_readable_streams.ts +40 -0
  526. package/src/stream/stream_channel.test.ts +129 -0
  527. package/src/stream/stream_channel.ts +32 -0
  528. package/src/stt/stream_adapter.ts +3 -5
  529. package/src/stt/stt.ts +134 -17
  530. package/src/tokenize/basic/basic.ts +13 -5
  531. package/src/tokenize/basic/sentence.ts +20 -6
  532. package/src/tokenize/token_stream.ts +7 -4
  533. package/src/transcription.ts +2 -3
  534. package/src/tts/index.ts +0 -1
  535. package/src/tts/stream_adapter.ts +42 -16
  536. package/src/tts/tts.ts +202 -21
  537. package/src/types.ts +42 -0
  538. package/src/utils.test.ts +658 -0
  539. package/src/utils.ts +402 -44
  540. package/src/vad.ts +90 -22
  541. package/src/voice/agent.test.ts +80 -0
  542. package/src/voice/agent.ts +332 -0
  543. package/src/voice/agent_activity.ts +1913 -0
  544. package/src/voice/agent_session.ts +460 -0
  545. package/src/voice/audio_recognition.ts +473 -0
  546. package/src/voice/events.ts +252 -0
  547. package/src/voice/generation.ts +881 -0
  548. package/src/voice/index.ts +7 -0
  549. package/src/voice/io.ts +304 -0
  550. package/src/voice/room_io/_input.ts +144 -0
  551. package/src/voice/room_io/_output.ts +436 -0
  552. package/src/voice/room_io/index.ts +5 -0
  553. package/src/voice/room_io/room_io.ts +495 -0
  554. package/src/voice/run_context.ts +20 -0
  555. package/src/voice/speech_handle.ts +104 -0
  556. package/src/voice/transcription/_utils.ts +25 -0
  557. package/src/voice/transcription/index.ts +4 -0
  558. package/src/voice/transcription/synchronizer.ts +477 -0
  559. package/src/worker.ts +22 -2
  560. package/dist/llm/function_context.cjs +0 -103
  561. package/dist/llm/function_context.cjs.map +0 -1
  562. package/dist/llm/function_context.d.cts +0 -47
  563. package/dist/llm/function_context.d.ts +0 -47
  564. package/dist/llm/function_context.d.ts.map +0 -1
  565. package/dist/llm/function_context.js +0 -78
  566. package/dist/llm/function_context.js.map +0 -1
  567. package/dist/llm/function_context.test.cjs +0 -218
  568. package/dist/llm/function_context.test.cjs.map +0 -1
  569. package/dist/llm/function_context.test.js +0 -217
  570. package/dist/llm/function_context.test.js.map +0 -1
  571. package/dist/multimodal/multimodal_agent.cjs +0 -486
  572. package/dist/multimodal/multimodal_agent.cjs.map +0 -1
  573. package/dist/multimodal/multimodal_agent.d.cts +0 -48
  574. package/dist/multimodal/multimodal_agent.d.ts +0 -48
  575. package/dist/multimodal/multimodal_agent.d.ts.map +0 -1
  576. package/dist/multimodal/multimodal_agent.js +0 -461
  577. package/dist/multimodal/multimodal_agent.js.map +0 -1
  578. package/dist/pipeline/agent_output.cjs +0 -197
  579. package/dist/pipeline/agent_output.cjs.map +0 -1
  580. package/dist/pipeline/agent_output.d.cts +0 -33
  581. package/dist/pipeline/agent_output.d.ts +0 -33
  582. package/dist/pipeline/agent_output.d.ts.map +0 -1
  583. package/dist/pipeline/agent_output.js +0 -172
  584. package/dist/pipeline/agent_output.js.map +0 -1
  585. package/dist/pipeline/agent_playout.cjs +0 -175
  586. package/dist/pipeline/agent_playout.cjs.map +0 -1
  587. package/dist/pipeline/agent_playout.d.cts +0 -40
  588. package/dist/pipeline/agent_playout.d.ts +0 -40
  589. package/dist/pipeline/agent_playout.d.ts.map +0 -1
  590. package/dist/pipeline/agent_playout.js +0 -139
  591. package/dist/pipeline/agent_playout.js.map +0 -1
  592. package/dist/pipeline/human_input.cjs +0 -171
  593. package/dist/pipeline/human_input.cjs.map +0 -1
  594. package/dist/pipeline/human_input.d.cts +0 -30
  595. package/dist/pipeline/human_input.d.ts +0 -30
  596. package/dist/pipeline/human_input.d.ts.map +0 -1
  597. package/dist/pipeline/human_input.js +0 -146
  598. package/dist/pipeline/human_input.js.map +0 -1
  599. package/dist/pipeline/index.cjs.map +0 -1
  600. package/dist/pipeline/index.d.cts +0 -2
  601. package/dist/pipeline/index.d.ts +0 -2
  602. package/dist/pipeline/index.d.ts.map +0 -1
  603. package/dist/pipeline/index.js +0 -11
  604. package/dist/pipeline/index.js.map +0 -1
  605. package/dist/pipeline/pipeline_agent.cjs +0 -859
  606. package/dist/pipeline/pipeline_agent.cjs.map +0 -1
  607. package/dist/pipeline/pipeline_agent.d.cts +0 -150
  608. package/dist/pipeline/pipeline_agent.d.ts +0 -150
  609. package/dist/pipeline/pipeline_agent.d.ts.map +0 -1
  610. package/dist/pipeline/pipeline_agent.js +0 -837
  611. package/dist/pipeline/pipeline_agent.js.map +0 -1
  612. package/dist/pipeline/speech_handle.cjs +0 -176
  613. package/dist/pipeline/speech_handle.cjs.map +0 -1
  614. package/dist/pipeline/speech_handle.d.cts +0 -37
  615. package/dist/pipeline/speech_handle.d.ts +0 -37
  616. package/dist/pipeline/speech_handle.d.ts.map +0 -1
  617. package/dist/pipeline/speech_handle.js +0 -152
  618. package/dist/pipeline/speech_handle.js.map +0 -1
  619. package/src/llm/function_context.test.ts +0 -248
  620. package/src/llm/function_context.ts +0 -142
  621. package/src/multimodal/multimodal_agent.ts +0 -592
  622. package/src/pipeline/agent_output.ts +0 -219
  623. package/src/pipeline/agent_playout.ts +0 -192
  624. package/src/pipeline/human_input.ts +0 -188
  625. package/src/pipeline/index.ts +0 -15
  626. package/src/pipeline/pipeline_agent.ts +0 -1197
  627. package/src/pipeline/speech_handle.ts +0 -201
@@ -0,0 +1,1913 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { Mutex } from '@livekit/mutex';
5
+ import type { AudioFrame } from '@livekit/rtc-node';
6
+ import { Heap } from 'heap-js';
7
+ import { AsyncLocalStorage } from 'node:async_hooks';
8
+ import { ReadableStream } from 'node:stream/web';
9
+ import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
10
+ import {
11
+ type ChatItem,
12
+ type FunctionCall,
13
+ type GenerationCreatedEvent,
14
+ type InputSpeechStartedEvent,
15
+ type InputSpeechStoppedEvent,
16
+ type InputTranscriptionCompleted,
17
+ LLM,
18
+ RealtimeModel,
19
+ type RealtimeModelError,
20
+ type RealtimeSession,
21
+ type ToolChoice,
22
+ type ToolContext,
23
+ } from '../llm/index.js';
24
+ import type { LLMError } from '../llm/llm.js';
25
+ import { log } from '../log.js';
26
+ import type {
27
+ EOUMetrics,
28
+ LLMMetrics,
29
+ RealtimeModelMetrics,
30
+ STTMetrics,
31
+ TTSMetrics,
32
+ VADMetrics,
33
+ } from '../metrics/base.js';
34
+ import { DeferredReadableStream } from '../stream/deferred_stream.js';
35
+ import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
36
+ import { splitWords } from '../tokenize/basic/word.js';
37
+ import { TTS, type TTSError } from '../tts/tts.js';
38
+ import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
39
+ import { VAD, type VADEvent } from '../vad.js';
40
+ import type { Agent, ModelSettings } from './agent.js';
41
+ import { StopResponse, asyncLocalStorage } from './agent.js';
42
+ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
43
+ import {
44
+ AudioRecognition,
45
+ type EndOfTurnInfo,
46
+ type RecognitionHooks,
47
+ type _TurnDetector,
48
+ } from './audio_recognition.js';
49
+ import {
50
+ AgentSessionEventTypes,
51
+ createErrorEvent,
52
+ createFunctionToolsExecutedEvent,
53
+ createMetricsCollectedEvent,
54
+ createSpeechCreatedEvent,
55
+ createUserInputTranscribedEvent,
56
+ } from './events.js';
57
+ import type { ToolExecutionOutput } from './generation.js';
58
+ import {
59
+ type _AudioOut,
60
+ type _TextOut,
61
+ performAudioForwarding,
62
+ performLLMInference,
63
+ performTTSInference,
64
+ performTextForwarding,
65
+ performToolExecutions,
66
+ removeInstructions,
67
+ updateInstructions,
68
+ } from './generation.js';
69
+ import { SpeechHandle } from './speech_handle.js';
70
+
71
// equivalent to Python's contextvars
// Carries the SpeechHandle that initiated the current async call chain, so
// deeply nested callbacks (e.g. metrics hooks) can attribute work to a speech.
const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
73
+
74
+ export class AgentActivity implements RecognitionHooks {
75
+ private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
76
+ private started = false;
77
+ private audioRecognition?: AudioRecognition;
78
+ private realtimeSession?: RealtimeSession;
79
+ private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
80
+ private logger = log();
81
+ private _draining = false;
82
+ private _currentSpeech?: SpeechHandle;
83
+ private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
+ private q_updated: Future;
85
+ private speechTasks: Set<Promise<unknown>> = new Set();
86
+ private lock = new Mutex();
87
+ private audioStream = new DeferredReadableStream<AudioFrame>();
88
+ // default to null as None, which maps to the default provider tool choice value
89
+ private toolChoice: ToolChoice | null = null;
90
+
91
+ agent: Agent;
92
+ agentSession: AgentSession;
93
+
94
+ /** @internal */
95
+ _mainTask?: Task<void>;
96
+ _userTurnCompletedTask?: Promise<void>;
97
+
98
+ constructor(agent: Agent, agentSession: AgentSession) {
99
+ this.agent = agent;
100
+ this.agentSession = agentSession;
101
+
102
+ /**
103
+ * Custom comparator to prioritize speech handles with higher priority
104
+ * - Prefer higher priority
105
+ * - Prefer earlier timestamp (so calling a sequence of generateReply() will execute in FIFO order)
106
+ */
107
+ this.speechQueue = new Heap<[number, number, SpeechHandle]>(([p1, t1, _], [p2, t2, __]) => {
108
+ return p1 === p2 ? t1 - t2 : p2 - p1;
109
+ });
110
+ this.q_updated = new Future();
111
+
112
+ this.turnDetectionMode =
113
+ typeof this.turnDetection === 'string' ? this.turnDetection : undefined;
114
+
115
+ if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
116
+ this.logger.warn(
117
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
118
+ );
119
+ this.turnDetectionMode = undefined;
120
+ }
121
+
122
+ if (this.turnDetectionMode === 'stt' && this.stt === undefined) {
123
+ this.logger.warn(
124
+ 'turnDetection is set to "stt", but no STT model is provided, ignoring the turnDetection setting',
125
+ );
126
+ this.turnDetectionMode = undefined;
127
+ }
128
+
129
+ if (this.llm instanceof RealtimeModel) {
130
+ if (this.llm.capabilities.turnDetection && !this.allowInterruptions) {
131
+ this.logger.warn(
132
+ 'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false, ' +
133
+ 'disable turnDetection in the RealtimeModel and use VAD on the AgentSession instead',
134
+ );
135
+ }
136
+
137
+ if (this.turnDetectionMode === 'realtime_llm' && !this.llm.capabilities.turnDetection) {
138
+ this.logger.warn(
139
+ 'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel or the server-side turn detection is not supported/enabled, ignoring the turnDetection setting',
140
+ );
141
+ this.turnDetectionMode = undefined;
142
+ }
143
+
144
+ if (this.turnDetectionMode === 'stt') {
145
+ this.logger.warn(
146
+ 'turnDetection is set to "stt", but the LLM is a RealtimeModel, ignoring the turnDetection setting',
147
+ );
148
+ this.turnDetectionMode = undefined;
149
+ }
150
+
151
+ if (
152
+ this.turnDetectionMode &&
153
+ this.turnDetectionMode !== 'realtime_llm' &&
154
+ this.llm.capabilities.turnDetection
155
+ ) {
156
+ this.logger.warn(
157
+ `turnDetection is set to "${this.turnDetectionMode}", but the LLM is a RealtimeModel and server-side turn detection enabled, ignoring the turnDetection setting`,
158
+ );
159
+ this.turnDetectionMode = undefined;
160
+ }
161
+
162
+ // fallback to VAD if server side turn detection is disabled and VAD is available
163
+ if (
164
+ !this.llm.capabilities.turnDetection &&
165
+ this.vad &&
166
+ this.turnDetectionMode === undefined
167
+ ) {
168
+ this.turnDetectionMode = 'vad';
169
+ }
170
+ } else if (this.turnDetectionMode === 'realtime_llm') {
171
+ this.logger.warn(
172
+ 'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel',
173
+ );
174
+ this.turnDetectionMode = undefined;
175
+ }
176
+
177
+ if (
178
+ !this.vad &&
179
+ this.stt &&
180
+ this.llm instanceof LLM &&
181
+ this.allowInterruptions &&
182
+ this.turnDetectionMode === undefined
183
+ ) {
184
+ this.logger.warn(
185
+ 'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
186
+ 'for more responsive interruption handling.',
187
+ );
188
+ }
189
+ }
190
+
191
+ async start(): Promise<void> {
192
+ const unlock = await this.lock.lock();
193
+ try {
194
+ this.agent._agentActivity = this;
195
+
196
+ if (this.llm instanceof RealtimeModel) {
197
+ this.realtimeSession = this.llm.session();
198
+ this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
199
+ this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
200
+ this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
201
+ this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
202
+ this.onInputAudioTranscriptionCompleted(ev),
203
+ );
204
+ this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
205
+ this.realtimeSession.on('error', (ev) => this.onError(ev));
206
+
207
+ removeInstructions(this.agent._chatCtx);
208
+ try {
209
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
210
+ } catch (error) {
211
+ this.logger.error(error, 'failed to update the instructions');
212
+ }
213
+
214
+ try {
215
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
216
+ } catch (error) {
217
+ this.logger.error(error, 'failed to update the chat context');
218
+ }
219
+
220
+ try {
221
+ await this.realtimeSession.updateTools(this.tools);
222
+ } catch (error) {
223
+ this.logger.error(error, 'failed to update the tools');
224
+ }
225
+ } else if (this.llm instanceof LLM) {
226
+ try {
227
+ updateInstructions({
228
+ chatCtx: this.agent._chatCtx,
229
+ instructions: this.agent.instructions,
230
+ addIfMissing: true,
231
+ });
232
+ } catch (error) {
233
+ this.logger.error('failed to update the instructions', error);
234
+ }
235
+ }
236
+
237
+ // metrics and error handling
238
+ if (this.llm instanceof LLM) {
239
+ this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
240
+ this.llm.on('error', (ev) => this.onError(ev));
241
+ }
242
+
243
+ if (this.stt instanceof STT) {
244
+ this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
245
+ this.stt.on('error', (ev) => this.onError(ev));
246
+ }
247
+
248
+ if (this.tts instanceof TTS) {
249
+ this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
250
+ this.tts.on('error', (ev) => this.onError(ev));
251
+ }
252
+
253
+ if (this.vad instanceof VAD) {
254
+ this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
255
+ }
256
+
257
+ this.audioRecognition = new AudioRecognition({
258
+ recognitionHooks: this,
259
+ // Disable stt node if stt is not provided
260
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
261
+ vad: this.vad,
262
+ turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
263
+ turnDetectionMode: this.turnDetectionMode,
264
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
265
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
266
+ });
267
+ this.audioRecognition.start();
268
+ this.started = true;
269
+
270
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
+ this.createSpeechTask({
272
+ promise: this.agent.onEnter(),
273
+ name: 'AgentActivity_onEnter',
274
+ });
275
+ } finally {
276
+ unlock();
277
+ }
278
+ }
279
+
280
  /** The speech handle currently being played out, if any. */
  get currentSpeech(): SpeechHandle | undefined {
    return this._currentSpeech;
  }

  /** Agent-level VAD takes precedence over the session-level one. */
  get vad(): VAD | undefined {
    return this.agent.vad || this.agentSession.vad;
  }

  /** Agent-level STT takes precedence over the session-level one. */
  get stt(): STT | undefined {
    return this.agent.stt || this.agentSession.stt;
  }

  /** Agent-level LLM takes precedence over the session-level one. */
  get llm(): LLM | RealtimeModel | undefined {
    return this.agent.llm || this.agentSession.llm;
  }

  /** Agent-level TTS takes precedence over the session-level one. */
  get tts(): TTS | undefined {
    return this.agent.tts || this.agentSession.tts;
  }

  /** Tool context exposed to the LLM for this activity. */
  get tools(): ToolContext {
    return this.agent.toolCtx;
  }

  /** True once close() has been requested and queued work is being drained. */
  get draining(): boolean {
    return this._draining;
  }

  /** The underlying realtime session, when the LLM is a RealtimeModel. */
  get realtimeLLMSession(): RealtimeSession | undefined {
    return this.realtimeSession;
  }

  get allowInterruptions(): boolean {
    // TODO(AJS-51): Allow options to be defined in Agent class
    return this.agentSession.options.allowInterruptions;
  }

  get turnDetection(): TurnDetectionMode | undefined {
    // TODO(brian): prioritize using agent.turn_detection
    return this.agentSession.turnDetection;
  }

  /** Same value as `tools`; kept for API symmetry with the agent. */
  get toolCtx(): ToolContext {
    return this.agent.toolCtx;
  }
325
+
326
+ async updateChatCtx(chatCtx: ChatContext): Promise<void> {
327
+ chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
328
+
329
+ this.agent._chatCtx = chatCtx;
330
+
331
+ if (this.realtimeSession) {
332
+ removeInstructions(chatCtx);
333
+ this.realtimeSession.updateChatCtx(chatCtx);
334
+ } else {
335
+ updateInstructions({
336
+ chatCtx,
337
+ instructions: this.agent.instructions,
338
+ addIfMissing: true,
339
+ });
340
+ }
341
+ }
342
+
343
+ updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
344
+ if (toolChoice !== undefined) {
345
+ this.toolChoice = toolChoice;
346
+ }
347
+
348
+ if (this.realtimeSession) {
349
+ this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
350
+ }
351
+ }
352
+
353
+ attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
354
+ if (this.audioStream.isSourceSet) {
355
+ this.logger.debug('detaching existing audio input in agent activity');
356
+ this.audioStream.detachSource();
357
+ }
358
+
359
+ /**
360
+ * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
361
+ * The tee() operation should be applied to the deferred stream, not the original audioStream.
362
+ * This is important because teeing the original stream directly makes it very difficult—if not
363
+ * impossible—to implement stream unlock logic cleanly.
364
+ */
365
+ this.audioStream.setSource(audioStream);
366
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
367
+
368
+ if (this.realtimeSession) {
369
+ this.realtimeSession.setInputAudioStream(realtimeAudioStream);
370
+ }
371
+
372
+ if (this.audioRecognition) {
373
+ this.audioRecognition.setInputAudioStream(recognitionAudioStream);
374
+ }
375
+ }
376
+
377
  /** Disconnects the current audio source from the deferred input stream. */
  detachAudioInput(): void {
    this.audioStream.detachSource();
  }
380
+
381
+ commitUserTurn() {
382
+ if (!this.audioRecognition) {
383
+ throw new Error('AudioRecognition is not initialized');
384
+ }
385
+
386
+ // TODO(brian): add audio_detached flag
387
+ const audioDetached = false;
388
+ this.audioRecognition.commitUserTurn(audioDetached);
389
+ }
390
+
391
+ clearUserTurn() {
392
+ this.audioRecognition?.clearUserTurn();
393
+ this.realtimeSession?.clearAudio();
394
+ }
395
+
396
+ say(
397
+ text: string | ReadableStream<string>,
398
+ options?: {
399
+ audio?: ReadableStream<AudioFrame>;
400
+ allowInterruptions?: boolean;
401
+ addToChatCtx?: boolean;
402
+ },
403
+ ): SpeechHandle {
404
+ const {
405
+ audio,
406
+ allowInterruptions: defaultAllowInterruptions,
407
+ addToChatCtx = true,
408
+ } = options ?? {};
409
+ let allowInterruptions = defaultAllowInterruptions;
410
+
411
+ if (
412
+ !audio &&
413
+ !this.tts &&
414
+ this.agentSession.output.audio &&
415
+ this.agentSession.output.audioEnabled
416
+ ) {
417
+ throw new Error('trying to generate speech from text without a TTS model');
418
+ }
419
+
420
+ if (
421
+ this.llm instanceof RealtimeModel &&
422
+ this.llm.capabilities.turnDetection &&
423
+ allowInterruptions === false
424
+ ) {
425
+ this.logger.warn(
426
+ 'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.say(), ' +
427
+ 'disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead',
428
+ );
429
+ allowInterruptions = true;
430
+ }
431
+
432
+ const handle = SpeechHandle.create({
433
+ allowInterruptions: allowInterruptions ?? this.allowInterruptions,
434
+ });
435
+
436
+ this.agentSession.emit(
437
+ AgentSessionEventTypes.SpeechCreated,
438
+ createSpeechCreatedEvent({
439
+ userInitiated: true,
440
+ source: 'say',
441
+ speechHandle: handle,
442
+ }),
443
+ );
444
+
445
+ const task = this.createSpeechTask({
446
+ promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
447
+ ownedSpeechHandle: handle,
448
+ name: 'AgentActivity.say_tts',
449
+ });
450
+
451
+ task.finally(() => this.onPipelineReplyDone());
452
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
453
+ return handle;
454
+ }
455
+
456
+ // -- Metrics and errors --
457
+
458
+ private onMetricsCollected = (
459
+ ev: STTMetrics | TTSMetrics | VADMetrics | LLMMetrics | RealtimeModelMetrics,
460
+ ) => {
461
+ const speechHandle = speechHandleStorage.getStore();
462
+ if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
463
+ ev.speechId = speechHandle.id;
464
+ }
465
+ this.agentSession.emit(
466
+ AgentSessionEventTypes.MetricsCollected,
467
+ createMetricsCollectedEvent({ metrics: ev }),
468
+ );
469
+ };
470
+
471
+ private onError(ev: RealtimeModelError | STTError | TTSError | LLMError): void {
472
+ if (ev.type === 'realtime_model_error') {
473
+ const errorEvent = createErrorEvent(ev.error, this.llm);
474
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
475
+ } else if (ev.type === 'stt_error') {
476
+ const errorEvent = createErrorEvent(ev.error, this.stt);
477
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
478
+ } else if (ev.type === 'tts_error') {
479
+ const errorEvent = createErrorEvent(ev.error, this.tts);
480
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
481
+ } else if (ev.type === 'llm_error') {
482
+ const errorEvent = createErrorEvent(ev.error, this.llm);
483
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
484
+ }
485
+
486
+ this.agentSession._onError(ev);
487
+ }
488
+
489
+ // -- Realtime Session events --
490
+
491
+ onInputSpeechStarted(_ev: InputSpeechStartedEvent): void {
492
+ this.logger.info('onInputSpeechStarted');
493
+
494
+ if (!this.vad) {
495
+ this.agentSession._updateUserState('speaking');
496
+ }
497
+
498
+ // this.interrupt() is going to raise when allow_interruptions is False,
499
+ // llm.InputSpeechStartedEvent is only fired by the server when the turn_detection is enabled.
500
+ try {
501
+ this.interrupt();
502
+ } catch (error) {
503
+ this.logger.error(
504
+ 'RealtimeAPI input_speech_started, but current speech is not interruptable, this should never happen!',
505
+ error,
506
+ );
507
+ }
508
+ }
509
+
510
+ onInputSpeechStopped(ev: InputSpeechStoppedEvent): void {
511
+ this.logger.info(ev, 'onInputSpeechStopped');
512
+
513
+ if (!this.vad) {
514
+ this.agentSession._updateUserState('listening');
515
+ }
516
+
517
+ if (ev.userTranscriptionEnabled) {
518
+ this.agentSession.emit(
519
+ AgentSessionEventTypes.UserInputTranscribed,
520
+ createUserInputTranscribedEvent({
521
+ isFinal: false,
522
+ transcript: '',
523
+ }),
524
+ );
525
+ }
526
+ }
527
+
528
+ onInputAudioTranscriptionCompleted(ev: InputTranscriptionCompleted): void {
529
+ this.agentSession.emit(
530
+ AgentSessionEventTypes.UserInputTranscribed,
531
+ createUserInputTranscribedEvent({
532
+ transcript: ev.transcript,
533
+ isFinal: ev.isFinal,
534
+ }),
535
+ );
536
+
537
+ if (ev.isFinal) {
538
+ const message = ChatMessage.create({
539
+ role: 'user',
540
+ content: ev.transcript,
541
+ id: ev.itemId,
542
+ });
543
+ this.agent._chatCtx.items.push(message);
544
+ this.agentSession._conversationItemAdded(message);
545
+ }
546
+ }
547
+
548
+ onGenerationCreated(ev: GenerationCreatedEvent): void {
549
+ if (ev.userInitiated) {
550
+ // user initiated generations are directly handled inside _realtime_reply_task
551
+ return;
552
+ }
553
+
554
+ if (this.draining) {
555
+ // copied from python:
556
+ // TODO(shubhra): should we "forward" this new turn to the next agent?
557
+ this.logger.warn('skipping new realtime generation, the agent is draining');
558
+ return;
559
+ }
560
+
561
+ const handle = SpeechHandle.create({
562
+ allowInterruptions: this.allowInterruptions,
563
+ });
564
+ this.agentSession.emit(
565
+ AgentSessionEventTypes.SpeechCreated,
566
+ createSpeechCreatedEvent({
567
+ userInitiated: false,
568
+ source: 'generate_reply',
569
+ speechHandle: handle,
570
+ }),
571
+ );
572
+ this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
573
+
574
+ this.createSpeechTask({
575
+ promise: this.realtimeGenerationTask(handle, ev, {}),
576
+ ownedSpeechHandle: handle,
577
+ name: 'AgentActivity.realtimeGeneration',
578
+ });
579
+
580
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
581
+ }
582
+
583
+ // recognition hooks
584
+
585
  /** VAD hook: local VAD detected speech onset — mark the user as speaking. */
  onStartOfSpeech(_ev: VADEvent): void {
    this.agentSession._updateUserState('speaking');
  }

  /** VAD hook: local VAD detected speech end — mark the user as listening. */
  onEndOfSpeech(_ev: VADEvent): void {
    this.agentSession._updateUserState('listening');
  }
592
+
593
+ onVADInferenceDone(ev: VADEvent): void {
594
+ if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
595
+ // skip speech handle interruption for manual and realtime model
596
+ return;
597
+ }
598
+
599
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
600
+ // skip speech handle interruption if server side turn detection is enabled
601
+ return;
602
+ }
603
+
604
+ if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
605
+ return;
606
+ }
607
+
608
+ if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
609
+ const text = this.audioRecognition.currentTranscript;
610
+
611
+ // TODO(shubhra): better word splitting for multi-language
612
+ if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
613
+ return;
614
+ }
615
+ }
616
+
617
+ this.realtimeSession?.startUserActivity();
618
+
619
+ if (
620
+ this._currentSpeech &&
621
+ !this._currentSpeech.interrupted &&
622
+ this._currentSpeech.allowInterruptions
623
+ ) {
624
+ this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
625
+ this.realtimeSession?.interrupt();
626
+ this._currentSpeech.interrupt();
627
+ }
628
+ }
629
+
630
+ onInterimTranscript(ev: SpeechEvent): void {
631
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
632
+ // skip stt transcription if userTranscription is enabled on the realtime model
633
+ return;
634
+ }
635
+
636
+ this.agentSession.emit(
637
+ AgentSessionEventTypes.UserInputTranscribed,
638
+ createUserInputTranscribedEvent({
639
+ transcript: ev.alternatives![0].text,
640
+ isFinal: false,
641
+ // TODO(AJS-106): add multi participant support
642
+ }),
643
+ );
644
+ }
645
+
646
+ onFinalTranscript(ev: SpeechEvent): void {
647
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
648
+ // skip stt transcription if userTranscription is enabled on the realtime model
649
+ return;
650
+ }
651
+
652
+ this.agentSession.emit(
653
+ AgentSessionEventTypes.UserInputTranscribed,
654
+ createUserInputTranscribedEvent({
655
+ transcript: ev.alternatives![0].text,
656
+ isFinal: true,
657
+ // TODO(AJS-106): add multi participant support
658
+ }),
659
+ );
660
+ }
661
+
662
+ private createSpeechTask<T>(options: {
663
+ promise: Promise<T>;
664
+ ownedSpeechHandle?: SpeechHandle;
665
+ name?: string;
666
+ }): Promise<T> {
667
+ const { promise, ownedSpeechHandle } = options;
668
+
669
+ this.speechTasks.add(promise);
670
+
671
+ promise.finally(() => {
672
+ this.speechTasks.delete(promise);
673
+
674
+ if (ownedSpeechHandle) {
675
+ ownedSpeechHandle._markPlayoutDone();
676
+ }
677
+
678
+ this.wakeupMainTask();
679
+ });
680
+
681
+ return promise;
682
+ }
683
+
684
+ async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
685
+ if (this.draining) {
686
+ this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
687
+ // copied from python:
688
+ // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
689
+ return true;
690
+ }
691
+
692
+ if (
693
+ this.stt &&
694
+ this.turnDetection !== 'manual' &&
695
+ this._currentSpeech &&
696
+ this._currentSpeech.allowInterruptions &&
697
+ !this._currentSpeech.interrupted &&
698
+ this.agentSession.options.minInterruptionWords > 0 &&
699
+ info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
700
+ ) {
701
+ // avoid interruption if the new_transcript is too short
702
+ this.logger.info('skipping user input, new_transcript is too short');
703
+ return false;
704
+ }
705
+
706
+ const oldTask = this._userTurnCompletedTask;
707
+ this._userTurnCompletedTask = this.createSpeechTask({
708
+ promise: this.userTurnCompleted(info, oldTask),
709
+ name: 'AgentActivity.userTurnCompleted',
710
+ });
711
+ return true;
712
+ }
713
+
714
  /** RecognitionHooks: supplies the session's chat context to the recognizer. */
  retrieveChatCtx(): ChatContext {
    return this.agentSession.chatCtx;
  }
717
+
718
  /**
   * Scheduling loop: drains the speech queue in priority order, authorizing
   * one speech handle at a time and waiting for its playout to finish before
   * moving to the next. Exits on abort, or once draining with no speech tasks
   * left.
   */
  private async mainTask(signal: AbortSignal): Promise<void> {
    // Bridge the AbortSignal into a Future so it can be raced with queue updates.
    const abortFuture = new Future();
    const abortHandler = () => {
      abortFuture.resolve();
      signal.removeEventListener('abort', abortHandler);
    };
    signal.addEventListener('abort', abortHandler);

    while (true) {
      // Sleep until either the queue changes or we are aborted.
      await Promise.race([this.q_updated.await, abortFuture.await]);
      if (signal.aborted) break;

      while (this.speechQueue.size() > 0) {
        if (signal.aborted) break;

        const heapItem = this.speechQueue.pop();
        if (!heapItem) {
          throw new Error('Speech queue is empty');
        }
        // heap entries are [priority, timestamp, speechHandle]
        const speechHandle = heapItem[2];
        this._currentSpeech = speechHandle;
        speechHandle._authorizePlayout();
        await speechHandle.waitForPlayout();
        this._currentSpeech = undefined;
      }

      // If we're draining and there are no more speech tasks, we can exit.
      // Only speech tasks can bypass draining to create a tool response
      if (this.draining && this.speechTasks.size === 0) {
        this.logger.info('mainTask: draining and no more speech tasks');
        break;
      }

      // Re-arm the wakeup future for the next scheduling round.
      this.q_updated = new Future();
    }

    this.logger.info('AgentActivity mainTask: exiting');
  }
756
+
757
  // Resolves the current wakeup future so mainTask re-evaluates the queue and
  // draining state (mainTask re-arms a fresh Future each round).
  private wakeupMainTask(): void {
    this.q_updated.resolve();
  }
760
+
761
+ generateReply(options: {
762
+ userMessage?: ChatMessage;
763
+ chatCtx?: ChatContext;
764
+ instructions?: string;
765
+ toolChoice?: ToolChoice | null;
766
+ allowInterruptions?: boolean;
767
+ }): SpeechHandle {
768
+ const {
769
+ userMessage,
770
+ chatCtx,
771
+ instructions: defaultInstructions,
772
+ toolChoice: defaultToolChoice,
773
+ allowInterruptions: defaultAllowInterruptions,
774
+ } = options;
775
+
776
+ let instructions = defaultInstructions;
777
+ let toolChoice = defaultToolChoice;
778
+ let allowInterruptions = defaultAllowInterruptions;
779
+
780
+ if (
781
+ this.llm instanceof RealtimeModel &&
782
+ this.llm.capabilities.turnDetection &&
783
+ allowInterruptions === false
784
+ ) {
785
+ this.logger.warn(
786
+ 'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.generateReply(), ' +
787
+ 'disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead',
788
+ );
789
+ allowInterruptions = true;
790
+ }
791
+
792
+ if (this.llm === undefined) {
793
+ throw new Error('trying to generate reply without an LLM model');
794
+ }
795
+
796
+ const functionCall = asyncLocalStorage.getStore()?.functionCall;
797
+ if (toolChoice === undefined && functionCall !== undefined) {
798
+ // when generateReply is called inside a tool, set toolChoice to 'none' by default
799
+ toolChoice = 'none';
800
+ }
801
+
802
+ const handle = SpeechHandle.create({
803
+ allowInterruptions: allowInterruptions ?? this.allowInterruptions,
804
+ });
805
+
806
+ this.agentSession.emit(
807
+ AgentSessionEventTypes.SpeechCreated,
808
+ createSpeechCreatedEvent({
809
+ userInitiated: true,
810
+ source: 'generate_reply',
811
+ speechHandle: handle,
812
+ }),
813
+ );
814
+ this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
815
+
816
+ if (this.llm instanceof RealtimeModel) {
817
+ this.createSpeechTask({
818
+ promise: this.realtimeReplyTask({
819
+ speechHandle: handle,
820
+ // TODO(brian): support llm.ChatMessage for the realtime model
821
+ userInput: userMessage?.textContent,
822
+ instructions,
823
+ modelSettings: {
824
+ // isGiven(toolChoice) = toolChoice !== undefined
825
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
826
+ },
827
+ }),
828
+ ownedSpeechHandle: handle,
829
+ name: 'AgentActivity.realtimeReply',
830
+ });
831
+ } else if (this.llm instanceof LLM) {
832
+ // instructions used inside generateReply are "extra" instructions.
833
+ // this matches the behavior of the Realtime API:
834
+ // https://platform.openai.com/docs/api-reference/realtime-client-events/response/create
835
+ if (instructions) {
836
+ instructions = `${this.agent.instructions}\n${instructions}`;
837
+ }
838
+
839
+ const task = this.createSpeechTask({
840
+ promise: this.pipelineReplyTask(
841
+ handle,
842
+ chatCtx ?? this.agent.chatCtx,
843
+ this.agent.toolCtx,
844
+ { toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice) },
845
+ instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
846
+ userMessage,
847
+ ),
848
+ ownedSpeechHandle: handle,
849
+ name: 'AgentActivity.pipelineReply',
850
+ });
851
+
852
+ task.finally(() => this.onPipelineReplyDone());
853
+ }
854
+
855
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
856
+ return handle;
857
+ }
858
+
859
+ interrupt(): Future<void> {
860
+ const future = new Future<void>();
861
+ const currentSpeech = this._currentSpeech;
862
+
863
+ currentSpeech?.interrupt();
864
+
865
+ for (const [_, __, speech] of this.speechQueue) {
866
+ speech.interrupt();
867
+ }
868
+
869
+ this.realtimeSession?.interrupt();
870
+
871
+ if (currentSpeech === undefined) {
872
+ future.resolve();
873
+ } else {
874
+ currentSpeech.then(() => {
875
+ if (future.done) return;
876
+ future.resolve();
877
+ });
878
+ }
879
+
880
+ return future;
881
+ }
882
+
883
+ private onPipelineReplyDone(): void {
884
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
885
+ this.agentSession._updateAgentState('listening');
886
+ }
887
+ }
888
+
889
  /**
   * Handles a completed user turn: runs the agent's onUserTurnCompleted
   * callback on a mutable context copy, interrupts current speech when
   * allowed, generates the reply, and emits end-of-utterance metrics.
   */
  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
    if (oldTask) {
      // We never cancel user code as this is very confusing.
      // So we wait for the old execution of onUserTurnCompleted to finish.
      // In practice this is OK because most speeches will be interrupted if a new turn
      // is detected. So the previous execution should complete quickly.
      await oldTask;
    }

    // When the audio recognition detects the end of a user turn:
    // - check if realtime model server-side turn detection is enabled
    // - check if there is no current generation happening
    // - cancel the current generation if it allows interruptions (otherwise skip this current
    //   turn)
    // - generate a reply to the user input

    if (this.llm instanceof RealtimeModel) {
      if (this.llm.capabilities.turnDetection) {
        // server-side detection generates the reply itself
        return;
      }
      this.realtimeSession?.commitAudio();
    }

    if (this._currentSpeech) {
      if (!this._currentSpeech.allowInterruptions) {
        this.logger.warn(
          { user_input: info.newTranscript },
          'skipping user input, current speech generation cannot be interrupted',
        );
        return;
      }

      this.logger.info(
        { 'speech id': this._currentSpeech.id },
        'speech interrupted, new user turn detected',
      );

      this._currentSpeech.interrupt();
      this.realtimeSession?.interrupt();
    }

    let userMessage: ChatMessage | undefined = ChatMessage.create({
      role: 'user',
      content: info.newTranscript,
    });

    // create a temporary mutable chat context to pass to onUserTurnCompleted
    // the user can edit it for the current generation, but changes will not be kept inside the
    // Agent.chatCtx
    const chatCtx = this.agent.chatCtx.copy();
    const startTime = Date.now();

    try {
      await this.agent.onUserTurnCompleted(chatCtx, userMessage);
    } catch (e) {
      // StopResponse is the documented way for user code to suppress the reply
      if (e instanceof StopResponse) {
        return;
      }
      this.logger.error({ error: e }, 'error occurred during onUserTurnCompleted');
    }

    // time spent in the user callback, reported in the EOU metrics below
    const callbackDuration = Date.now() - startTime;

    if (this.llm instanceof RealtimeModel) {
      // ignore stt transcription for realtime model
      userMessage = undefined;
    } else if (this.llm === undefined) {
      return;
    }

    // Ensure the new message is passed to generateReply
    // This preserves the original message id, making it easier for users to track responses
    const speechHandle = this.generateReply({ userMessage, chatCtx });

    const eouMetrics: EOUMetrics = {
      type: 'eou_metrics',
      timestamp: Date.now(),
      endOfUtteranceDelay: info.endOfUtteranceDelay,
      transcriptionDelay: info.transcriptionDelay,
      onUserTurnCompletedDelay: callbackDuration,
      speechId: speechHandle.id,
    };

    this.agentSession.emit(
      AgentSessionEventTypes.MetricsCollected,
      createMetricsCollectedEvent({ metrics: eouMetrics }),
    );
  }
977
+
978
  /**
   * Speaks `text` through the session outputs: forwards the (possibly transformed)
   * text to the transcription output and either synthesizes audio via the agent's
   * TTS node or forwards a pre-generated `audio` stream.
   *
   * @param speechHandle - handle gating authorization/interruption for this speech
   * @param text - literal text or a stream of text chunks to speak
   * @param addToChatCtx - when true, record the forwarded text as an assistant message
   * @param modelSettings - settings passed to the TTS node
   * @param audio - optional pre-generated audio; when set, TTS inference is skipped
   */
  private async ttsTask(
    speechHandle: SpeechHandle,
    text: string | ReadableStream<string>,
    addToChatCtx: boolean,
    modelSettings: ModelSettings,
    audio?: ReadableStream<AudioFrame> | null,
  ): Promise<void> {
    // make the speech handle visible to async-local consumers downstream
    speechHandleStorage.enterWith(speechHandle);

    const transcriptionOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;

    const replyAbortController = new AbortController();
    // wait until the speech is scheduled/authorized (or interrupted)
    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);

    if (speechHandle.interrupted) {
      return;
    }

    // normalize the input to a stream: wrap a plain string in a one-chunk stream
    let baseStream: ReadableStream<string>;
    if (text instanceof ReadableStream) {
      baseStream = text;
    } else {
      baseStream = new ReadableStream({
        start(controller) {
          controller.enqueue(text);
          controller.close();
        },
      });
    }

    // tee so transcription forwarding and TTS each get their own copy
    const [textSource, audioSource] = baseStream.tee();

    const tasks: Array<Task<void>> = [];

    const trNode = await this.agent.transcriptionNode(textSource, {});
    let textOut: _TextOut | null = null;
    if (trNode) {
      const [textForwardTask, _textOut] = performTextForwarding(
        trNode,
        replyAbortController,
        transcriptionOutput,
      );
      textOut = _textOut;
      tasks.push(textForwardTask);
    }

    // flip to 'speaking' as soon as the first frame/text chunk is emitted
    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    if (!audioOutput) {
      if (textOut) {
        textOut.firstTextFut.await.finally(onFirstFrame);
      }
    } else {
      let audioOut: _AudioOut | null = null;
      if (!audio) {
        // generate audio using TTS
        const [ttsTask, ttsStream] = performTTSInference(
          (...args) => this.agent.ttsNode(...args),
          audioSource,
          modelSettings,
          replyAbortController,
        );
        tasks.push(ttsTask);

        const [forwardTask, _audioOut] = performAudioForwarding(
          ttsStream,
          audioOutput,
          replyAbortController,
        );
        tasks.push(forwardTask);
        audioOut = _audioOut;
      } else {
        // use the provided audio
        const [forwardTask, _audioOut] = performAudioForwarding(
          audio,
          audioOutput,
          replyAbortController,
        );
        tasks.push(forwardTask);
        audioOut = _audioOut;
      }
      audioOut.firstFrameFut.await.finally(onFirstFrame);
    }

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
    }

    if (speechHandle.interrupted) {
      // abort the forwarding pipeline and flush any queued audio
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      if (audioOutput) {
        audioOutput.clearBuffer();
        await audioOutput.waitForPlayout();
      }
    }

    if (addToChatCtx) {
      const message = ChatMessage.create({
        role: 'assistant',
        content: textOut?.text || '',
        interrupted: speechHandle.interrupted,
      });
      this.agent._chatCtx.insert(message);
      this.agentSession._conversationItemAdded(message);
    }

    if (this.agentSession.agentState === 'speaking') {
      this.agentSession._updateAgentState('listening');
    }
  }
1099
+
1100
  /**
   * Runs one step of the STT→LLM→TTS pipeline for `speechHandle`: performs LLM
   * inference over `chatCtx`, forwards transcription/audio to the session outputs,
   * executes any tool calls, records the assistant message, and — when a tool
   * requires a reply — schedules a follow-up pipeline step with an incremented
   * `stepIndex`.
   *
   * @param speechHandle - handle gating authorization/interruption for this step
   * @param chatCtx - context snapshot used for inference (copied; the caller's is untouched)
   * @param toolCtx - tools available to the LLM for this step
   * @param modelSettings - inference settings (notably `toolChoice`)
   * @param instructions - optional system instructions injected into the copied context
   * @param newMessage - optional user message inserted before inference
   * @param toolsMessages - tool call/output items from the previous step that triggered this reply
   */
  private async pipelineReplyTask(
    speechHandle: SpeechHandle,
    chatCtx: ChatContext,
    toolCtx: ToolContext,
    modelSettings: ModelSettings,
    instructions?: string,
    newMessage?: ChatMessage,
    toolsMessages?: ChatItem[],
  ): Promise<void> {
    speechHandleStorage.enterWith(speechHandle);

    const replyAbortController = new AbortController();

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;
    const transcriptionOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;

    // work on a private copy so mutations don't leak back to the caller
    chatCtx = chatCtx.copy();

    if (newMessage) {
      chatCtx.insert(newMessage);
      this.agent._chatCtx.insert(newMessage);
      this.agentSession._conversationItemAdded(newMessage);
    }

    if (instructions) {
      try {
        updateInstructions({
          chatCtx,
          instructions,
          addIfMissing: true,
        });
      } catch (e) {
        this.logger.error({ error: e }, 'error occurred during updateInstructions');
      }
    }

    this.agentSession._updateAgentState('thinking');
    const tasks: Array<Task<void>> = [];
    const [llmTask, llmGenData] = performLLMInference(
      // preserve `this` context in llmNode
      (...args) => this.agent.llmNode(...args),
      chatCtx,
      toolCtx,
      modelSettings,
      replyAbortController,
    );
    tasks.push(llmTask);

    // tee the LLM text: one branch feeds TTS, the other the transcription node
    const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();

    let ttsTask: Task<void> | null = null;
    let ttsStream: ReadableStream<AudioFrame> | null = null;
    if (audioOutput) {
      [ttsTask, ttsStream] = performTTSInference(
        (...args) => this.agent.ttsNode(...args),
        ttsTextInput,
        modelSettings,
        replyAbortController,
      );
      tasks.push(ttsTask);
    }

    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
    if (speechHandle.interrupted) {
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      return;
    }

    const replyStartedAt = Date.now();
    const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
    let textOut: _TextOut | null = null;
    if (trNodeResult) {
      const [textForwardTask, _textOut] = performTextForwarding(
        trNodeResult,
        replyAbortController,
        transcriptionOutput,
      );
      tasks.push(textForwardTask);
      textOut = _textOut;
    }

    // flip to 'speaking' on the first audio frame (or first text chunk when audio is off)
    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    let audioOut: _AudioOut | null = null;
    if (audioOutput) {
      if (ttsStream) {
        const [forwardTask, _audioOut] = performAudioForwarding(
          ttsStream,
          audioOutput,
          replyAbortController,
        );
        audioOut = _audioOut;
        tasks.push(forwardTask);
        audioOut.firstFrameFut.await.finally(onFirstFrame);
      } else {
        throw Error('ttsStream is null when audioOutput is enabled');
      }
    } else {
      textOut?.firstTextFut.await.finally(onFirstFrame);
    }

    const onToolExecutionStarted = (_: FunctionCall) => {
      // TODO(brian): handle speech_handle item_added
    };

    const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
      // TODO(brian): handle speech_handle item_added
    };

    // run tool calls concurrently with text/audio forwarding
    const [executeToolsTask, toolOutput] = performToolExecutions({
      session: this.agentSession,
      speechHandle,
      toolCtx,
      toolChoice: modelSettings.toolChoice,
      toolCallStream: llmGenData.toolCallStream,
      controller: replyAbortController,
      onToolExecutionStarted,
      onToolExecutionCompleted,
    });
    tasks.push(executeToolsTask);

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
    }

    // add the tools messages that triggers this reply to the chat context
    if (toolsMessages) {
      for (const msg of toolsMessages) {
        msg.createdAt = replyStartedAt;
      }
      this.agent._chatCtx.insert(toolsMessages);
    }

    if (speechHandle.interrupted) {
      this.logger.debug(
        { speech_id: speechHandle.id },
        'Aborting all pipeline reply tasks due to interruption',
      );
      replyAbortController.abort();
      await Promise.allSettled(
        tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
      );

      let forwardedText = textOut?.text || '';

      if (audioOutput) {
        audioOutput.clearBuffer();
        const playbackEv = await audioOutput.waitForPlayout();
        if (audioOut?.firstFrameFut.done) {
          // playback EV is valid only if the first frame was already played
          this.logger.info(
            { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
            'playout interrupted',
          );
          if (playbackEv.synchronizedTranscript) {
            forwardedText = playbackEv.synchronizedTranscript;
          }
        } else {
          forwardedText = '';
        }
      }

      // record only the text that was actually delivered before the interruption
      if (forwardedText) {
        const message = ChatMessage.create({
          role: 'assistant',
          content: forwardedText,
          id: llmGenData.id,
          interrupted: true,
          createdAt: replyStartedAt,
        });
        chatCtx.insert(message);
        this.agent._chatCtx.insert(message);
        this.agentSession._conversationItemAdded(message);
      }

      if (this.agentSession.agentState === 'speaking') {
        this.agentSession._updateAgentState('listening');
      }

      this.logger.info(
        { speech_id: speechHandle.id, message: forwardedText },
        'playout completed with interrupt',
      );
      // TODO(shubhra) add chat message to speech handle
      speechHandle._markPlayoutDone();
      await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      return;
    }

    if (textOut && textOut.text) {
      const message = ChatMessage.create({
        role: 'assistant',
        id: llmGenData.id,
        interrupted: false,
        createdAt: replyStartedAt,
        content: textOut.text,
      });
      chatCtx.insert(message);
      this.agent._chatCtx.insert(message);
      this.agentSession._conversationItemAdded(message);
      this.logger.info(
        { speech_id: speechHandle.id, message: textOut.text },
        'playout completed without interruption',
      );
    }

    if (toolOutput.output.length > 0) {
      this.agentSession._updateAgentState('thinking');
    } else if (this.agentSession.agentState === 'speaking') {
      this.agentSession._updateAgentState('listening');
    }

    // mark playout done before waiting for (possibly slow) tool execution
    speechHandle._markPlayoutDone();
    await executeToolsTask.result;

    if (toolOutput.output.length === 0) return;

    // important: no agent output should be used after this point
    const { maxToolSteps } = this.agentSession.options;
    if (speechHandle.stepIndex >= maxToolSteps) {
      this.logger.warn(
        { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
        'maximum number of function calls steps reached',
      );
      return;
    }

    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
      functionCalls: [],
      functionCallOutputs: [],
    });
    let shouldGenerateToolReply: boolean = false;
    let newAgentTask: Agent | null = null;
    let ignoreTaskSwitch: boolean = false;

    for (const sanitizedOut of toolOutput.output) {
      if (sanitizedOut.toolCallOutput !== undefined) {
        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
        if (sanitizedOut.replyRequired) {
          shouldGenerateToolReply = true;
        }
      }

      // only one agent handoff is supported per batch of tool executions
      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
        this.logger.error('expected to receive only one agent task from the tool executions');
        ignoreTaskSwitch = true;
        // TODO(brian): should we mark the function call as failed to notify the LLM?
      }

      newAgentTask = sanitizedOut.agentTask ?? null;

      this.logger.debug(
        {
          speechId: speechHandle.id,
          name: sanitizedOut.toolCall?.name,
          args: sanitizedOut.toolCall.args,
          output: sanitizedOut.toolCallOutput?.output,
          isError: sanitizedOut.toolCallOutput?.isError,
        },
        'Tool call execution finished',
      );
    }

    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );

    let draining = this.draining;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
      draining = true;
    }

    const toolMessages = [
      ...functionToolsExecutedEvent.functionCalls,
      ...functionToolsExecutedEvent.functionCallOutputs,
    ] as ChatItem[];
    if (shouldGenerateToolReply) {
      chatCtx.insert(toolMessages);

      // chain a follow-up pipeline step that answers the tool outputs
      const handle = SpeechHandle.create({
        allowInterruptions: speechHandle.allowInterruptions,
        stepIndex: speechHandle.stepIndex + 1,
        parent: speechHandle,
      });
      this.agentSession.emit(
        AgentSessionEventTypes.SpeechCreated,
        createSpeechCreatedEvent({
          userInitiated: false,
          source: 'tool_response',
          speechHandle: handle,
        }),
      );

      // Avoid setting tool_choice to "required" or a specific function when
      // passing tool response back to the LLM
      const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';

      const toolResponseTask = this.createSpeechTask({
        promise: this.pipelineReplyTask(
          handle,
          chatCtx,
          toolCtx,
          { toolChoice: respondToolChoice },
          instructions,
          undefined,
          toolMessages,
        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.pipelineReply',
      });

      toolResponseTask.finally(() => this.onPipelineReplyDone());

      // bypassDraining: the follow-up must run even while the activity drains
      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
    } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
      for (const msg of toolMessages) {
        msg.createdAt = replyStartedAt;
      }
      this.agent._chatCtx.insert(toolMessages);
    }
  }
1433
+
1434
  /**
   * Consumes one realtime-model generation (`ev`): forwards its message text and
   * audio to the session outputs, collects tool calls, executes them, pushes the
   * tool outputs back to the realtime session, and — when a reply is required and
   * the model does not auto-generate one — schedules a follow-up realtime reply.
   *
   * @param speechHandle - handle gating authorization/interruption for this generation
   * @param ev - generation event carrying the message and function-call streams
   * @param modelSettings - settings for this generation (notably `toolChoice`)
   * @throws Error if the realtime session is missing or `llm` is not a RealtimeModel
   */
  private async realtimeGenerationTask(
    speechHandle: SpeechHandle,
    ev: GenerationCreatedEvent,
    modelSettings: ModelSettings,
  ): Promise<void> {
    speechHandleStorage.enterWith(speechHandle);

    if (!this.realtimeSession) {
      throw new Error('realtime session is not initialized');
    }
    if (!(this.llm instanceof RealtimeModel)) {
      throw new Error('llm is not a realtime model');
    }

    this.logger.debug(
      { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
      'realtime generation started',
    );

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;
    const textOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;
    const toolCtx = this.realtimeSession.tools;

    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);

    if (speechHandle.interrupted) {
      return;
    }

    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    const replyAbortController = new AbortController();

    // Drains ev.messageStream, forwarding each message's text/audio to the
    // outputs and recording [messageId, textOut, audioOut] into `outputs`.
    // Only the first message is forwarded; extras are logged and skipped.
    const readMessages = async (
      abortController: AbortController,
      outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
    ) => {
      const forwardTasks: Array<Task<void>> = [];
      try {
        for await (const msg of ev.messageStream) {
          if (forwardTasks.length > 0) {
            this.logger.warn(
              'expected to receive only one message generation from the realtime API',
            );
            break;
          }
          const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
          let textOut: _TextOut | null = null;
          if (trNodeResult) {
            const [textForwardTask, _textOut] = performTextForwarding(
              trNodeResult,
              abortController,
              textOutput,
            );
            forwardTasks.push(textForwardTask);
            textOut = _textOut;
          }
          let audioOut: _AudioOut | null = null;
          if (audioOutput) {
            const realtimeAudio = await this.agent.realtimeAudioOutputNode(
              msg.audioStream,
              modelSettings,
            );
            if (realtimeAudio) {
              const [forwardTask, _audioOut] = performAudioForwarding(
                realtimeAudio,
                audioOutput,
                abortController,
              );
              forwardTasks.push(forwardTask);
              audioOut = _audioOut;
              audioOut.firstFrameFut.await.finally(onFirstFrame);
            } else {
              this.logger.warn(
                'audio output is enabled but neither tts nor realtime audio is available',
              );
            }
          } else if (textOut) {
            textOut.firstTextFut.await.finally(onFirstFrame);
          }
          outputs.push([msg.messageId, textOut, audioOut]);
        }
        await waitFor(forwardTasks);
      } catch (error) {
        this.logger.error(error, 'error reading messages from the realtime API');
      } finally {
        await cancelAndWait(forwardTasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      }
    };

    const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
    const tasks = [
      Task.from(
        (controller) => readMessages(controller, messageOutputs),
        replyAbortController,
        'AgentActivity.realtime_generation.read_messages',
      ),
    ];

    // tee so tool calls can be both executed and collected for tracing
    const [toolCallStream, toolCallStreamForTracing] = ev.functionStream.tee();
    // TODO(brian): append to tracing tees
    const toolCalls: FunctionCall[] = [];

    const readToolStreamTask = async (
      controller: AbortController,
      stream: ReadableStream<FunctionCall>,
    ) => {
      const reader = stream.getReader();
      try {
        while (!controller.signal.aborted) {
          const { done, value } = await reader.read();
          if (done) break;

          this.logger.debug({ tool_call: value }, 'received tool call from the realtime API');
          toolCalls.push(value);
        }
      } finally {
        reader.releaseLock();
      }
    };

    tasks.push(
      Task.from(
        (controller) => readToolStreamTask(controller, toolCallStreamForTracing),
        replyAbortController,
        'AgentActivity.realtime_generation.read_tool_stream',
      ),
    );

    const onToolExecutionStarted = (_: FunctionCall) => {
      // TODO(brian): handle speech_handle item_added
    };

    const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
      // TODO(brian): handle speech_handle item_added
    };

    const [executeToolsTask, toolOutput] = performToolExecutions({
      session: this.agentSession,
      speechHandle,
      toolCtx,
      toolCallStream,
      toolChoice: modelSettings.toolChoice,
      controller: replyAbortController,
      onToolExecutionStarted,
      onToolExecutionCompleted,
    });

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    // TODO(brian): add tracing span

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
      this.agentSession._updateAgentState('listening');
    }

    if (speechHandle.interrupted) {
      this.logger.debug(
        { speech_id: speechHandle.id },
        'Aborting all realtime generation tasks due to interruption',
      );
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);

      if (messageOutputs.length > 0) {
        // there should be only one message
        const [msgId, textOut, audioOut] = messageOutputs[0]!;
        let forwardedText = textOut?.text || '';

        if (audioOutput) {
          audioOutput.clearBuffer();
          const playbackEv = await audioOutput.waitForPlayout();
          let playbackPosition = playbackEv.playbackPosition;
          if (audioOut?.firstFrameFut.done) {
            // playback EV is valid only if the first frame was already played
            this.logger.info(
              { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
              'playout interrupted',
            );
            if (playbackEv.synchronizedTranscript) {
              forwardedText = playbackEv.synchronizedTranscript;
            }
          } else {
            forwardedText = '';
            playbackPosition = 0;
          }

          // truncate server-side message
          this.realtimeSession.truncate({
            messageId: msgId,
            audioEndMs: Math.floor(playbackPosition),
          });
        }

        if (forwardedText) {
          const message = ChatMessage.create({
            role: 'assistant',
            content: forwardedText,
            id: msgId,
            interrupted: true,
          });
          this.agent._chatCtx.insert(message);
          speechHandle._setChatMessage(message);
          this.agentSession._conversationItemAdded(message);

          // TODO(brian): add tracing span
        }
        this.logger.info(
          { speech_id: speechHandle.id, message: forwardedText },
          'playout completed with interrupt',
        );
      }
      // TODO(shubhra) add chat message to speech handle
      speechHandle._markPlayoutDone();
      await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);

      // TODO(brian): close tees
      return;
    }

    if (messageOutputs.length > 0) {
      // there should be only one message
      const [msgId, textOut, _] = messageOutputs[0]!;
      const message = ChatMessage.create({
        role: 'assistant',
        content: textOut?.text || '',
        id: msgId,
        interrupted: false,
      });
      this.agent._chatCtx.insert(message);
      speechHandle._setChatMessage(message);
      this.agentSession._conversationItemAdded(message);
      // TODO(brian): add tracing span
    }

    // mark the playout done before waiting for the tool execution
    speechHandle._markPlayoutDone();
    // TODO(brian): close tees

    toolOutput.firstToolStartedFuture.await.finally(() => {
      this.agentSession._updateAgentState('thinking');
    });

    await executeToolsTask.result;

    if (toolOutput.output.length === 0) return;

    // important: no agent output should be used after this point
    const { maxToolSteps } = this.agentSession.options;
    if (speechHandle.stepIndex >= maxToolSteps) {
      this.logger.warn(
        { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
        'maximum number of function calls steps reached',
      );
      return;
    }

    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
      functionCalls: [],
      functionCallOutputs: [],
    });
    let shouldGenerateToolReply: boolean = false;
    let newAgentTask: Agent | null = null;
    let ignoreTaskSwitch: boolean = false;

    // NOTE(review): unlike pipelineReplyTask, this loop never pushes
    // sanitizedOut.toolCall into functionToolsExecutedEvent.functionCalls, so the
    // emitted event has an empty functionCalls list — confirm whether intentional.
    for (const sanitizedOut of toolOutput.output) {
      if (sanitizedOut.toolCallOutput !== undefined) {
        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
        if (sanitizedOut.replyRequired) {
          shouldGenerateToolReply = true;
        }
      }

      // only one agent handoff is supported per batch of tool executions
      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
        this.logger.error('expected to receive only one agent task from the tool executions');
        ignoreTaskSwitch = true;
      }

      newAgentTask = sanitizedOut.agentTask ?? null;

      this.logger.debug(
        {
          speechId: speechHandle.id,
          name: sanitizedOut.toolCall?.name,
          args: sanitizedOut.toolCall.args,
          output: sanitizedOut.toolCallOutput?.output,
          isError: sanitizedOut.toolCallOutput?.isError,
        },
        'Tool call execution finished',
      );
    }

    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );

    let draining = this.draining;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
      draining = true;
    }

    // push the tool outputs to the realtime session's server-side chat context
    if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
      const chatCtx = this.realtimeSession.chatCtx.copy();
      chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
      try {
        await this.realtimeSession.updateChatCtx(chatCtx);
      } catch (error) {
        this.logger.warn(
          { error },
          'failed to update chat context before generating the function calls results',
        );
      }
    }

    // skip realtime reply if not required or auto-generated
    if (!shouldGenerateToolReply || this.llm.capabilities.autoToolReplyGeneration) {
      return;
    }

    this.realtimeSession.interrupt();

    const replySpeechHandle = SpeechHandle.create({
      allowInterruptions: speechHandle.allowInterruptions,
      stepIndex: speechHandle.stepIndex + 1,
      parent: speechHandle,
    });
    this.agentSession.emit(
      AgentSessionEventTypes.SpeechCreated,
      createSpeechCreatedEvent({
        userInitiated: false,
        source: 'tool_response',
        speechHandle: replySpeechHandle,
      }),
    );

    const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
    this.createSpeechTask({
      promise: this.realtimeReplyTask({
        speechHandle: replySpeechHandle,
        modelSettings: { toolChoice },
      }),
      ownedSpeechHandle: replySpeechHandle,
      name: 'AgentActivity.realtime_reply',
    });

    // bypassDraining: the tool reply must run even while the activity drains
    this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  }
1790
+
1791
+ private async realtimeReplyTask({
1792
+ speechHandle,
1793
+ modelSettings: { toolChoice },
1794
+ userInput,
1795
+ instructions,
1796
+ }: {
1797
+ speechHandle: SpeechHandle;
1798
+ modelSettings: ModelSettings;
1799
+ userInput?: string;
1800
+ instructions?: string;
1801
+ }): Promise<void> {
1802
+ speechHandleStorage.enterWith(speechHandle);
1803
+
1804
+ if (!this.realtimeSession) {
1805
+ throw new Error('realtime session is not available');
1806
+ }
1807
+
1808
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1809
+
1810
+ if (userInput) {
1811
+ const chatCtx = this.realtimeSession.chatCtx.copy();
1812
+ const message = chatCtx.addMessage({
1813
+ role: 'user',
1814
+ content: userInput,
1815
+ });
1816
+ await this.realtimeSession.updateChatCtx(chatCtx);
1817
+ this.agent._chatCtx.insert(message);
1818
+ this.agentSession._conversationItemAdded(message);
1819
+ }
1820
+
1821
+ const originalToolChoice = this.toolChoice;
1822
+ if (toolChoice !== undefined) {
1823
+ this.realtimeSession.updateOptions({ toolChoice });
1824
+ }
1825
+
1826
+ try {
1827
+ const generationEvent = await this.realtimeSession.generateReply(instructions);
1828
+ await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1829
+ } finally {
1830
+ // reset toolChoice value
1831
+ if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
1832
+ this.realtimeSession.updateOptions({ toolChoice: originalToolChoice });
1833
+ }
1834
+ }
1835
+ }
1836
+
1837
+ private scheduleSpeech(
1838
+ speechHandle: SpeechHandle,
1839
+ priority: number,
1840
+ bypassDraining: boolean = false,
1841
+ ): void {
1842
+ if (this.draining && !bypassDraining) {
1843
+ throw new Error('cannot schedule new speech, the agent is draining');
1844
+ }
1845
+
1846
+ // Monotonic time to avoid near 0 collisions
1847
+ this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1848
+ this.wakeupMainTask();
1849
+ }
1850
+
1851
+ async drain(): Promise<void> {
1852
+ const unlock = await this.lock.lock();
1853
+ try {
1854
+ if (this._draining) return;
1855
+
1856
+ this.createSpeechTask({
1857
+ promise: this.agent.onExit(),
1858
+ name: 'AgentActivity_onExit',
1859
+ });
1860
+
1861
+ this.wakeupMainTask();
1862
+ this._draining = true;
1863
+ await this._mainTask?.result;
1864
+ } finally {
1865
+ unlock();
1866
+ }
1867
+ }
1868
+
1869
+ async close(): Promise<void> {
1870
+ const unlock = await this.lock.lock();
1871
+ try {
1872
+ if (!this._draining) {
1873
+ this.logger.warn('task closing without draining');
1874
+ }
1875
+
1876
+ // Unregister event handlers to prevent duplicate metrics
1877
+ if (this.llm instanceof LLM) {
1878
+ this.llm.off('metrics_collected', this.onMetricsCollected);
1879
+ }
1880
+ if (this.realtimeSession) {
1881
+ this.realtimeSession.off('generation_created', this.onGenerationCreated);
1882
+ this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
1883
+ this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
1884
+ this.realtimeSession.off(
1885
+ 'input_audio_transcription_completed',
1886
+ this.onInputAudioTranscriptionCompleted,
1887
+ );
1888
+ this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
1889
+ }
1890
+ if (this.stt instanceof STT) {
1891
+ this.stt.off('metrics_collected', this.onMetricsCollected);
1892
+ }
1893
+ if (this.tts instanceof TTS) {
1894
+ this.tts.off('metrics_collected', this.onMetricsCollected);
1895
+ }
1896
+ if (this.vad instanceof VAD) {
1897
+ this.vad.off('metrics_collected', this.onMetricsCollected);
1898
+ }
1899
+
1900
+ this.detachAudioInput();
1901
+ await this.realtimeSession?.close();
1902
+ await this.audioRecognition?.close();
1903
+ await this._mainTask?.cancelAndWait();
1904
+ } finally {
1905
+ unlock();
1906
+ }
1907
+ }
1908
+ }
1909
+
1910
+ function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {
1911
+ // we convert null to undefined, which maps to the default provider tool choice value
1912
+ return toolChoice !== null ? toolChoice : undefined;
1913
+ }