@livekit/agents 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (959)
  1. package/dist/_exceptions.cjs.map +1 -1
  2. package/dist/_exceptions.d.ts.map +1 -1
  3. package/dist/_exceptions.js.map +1 -1
  4. package/dist/audio.cjs +89 -3
  5. package/dist/audio.cjs.map +1 -1
  6. package/dist/audio.d.cts +36 -1
  7. package/dist/audio.d.ts +36 -1
  8. package/dist/audio.d.ts.map +1 -1
  9. package/dist/audio.js +76 -2
  10. package/dist/audio.js.map +1 -1
  11. package/dist/beta/index.cjs +29 -0
  12. package/dist/beta/index.cjs.map +1 -0
  13. package/dist/beta/index.d.cts +2 -0
  14. package/dist/beta/index.d.ts +2 -0
  15. package/dist/beta/index.d.ts.map +1 -0
  16. package/dist/beta/index.js +7 -0
  17. package/dist/beta/index.js.map +1 -0
  18. package/dist/beta/workflows/index.cjs +29 -0
  19. package/dist/beta/workflows/index.cjs.map +1 -0
  20. package/dist/beta/workflows/index.d.cts +2 -0
  21. package/dist/beta/workflows/index.d.ts +2 -0
  22. package/dist/beta/workflows/index.d.ts.map +1 -0
  23. package/dist/beta/workflows/index.js +7 -0
  24. package/dist/beta/workflows/index.js.map +1 -0
  25. package/dist/beta/workflows/task_group.cjs +165 -0
  26. package/dist/beta/workflows/task_group.cjs.map +1 -0
  27. package/dist/beta/workflows/task_group.d.cts +32 -0
  28. package/dist/beta/workflows/task_group.d.ts +32 -0
  29. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  30. package/dist/beta/workflows/task_group.js +141 -0
  31. package/dist/beta/workflows/task_group.js.map +1 -0
  32. package/dist/cli.cjs +44 -46
  33. package/dist/cli.cjs.map +1 -1
  34. package/dist/cli.d.cts +3 -3
  35. package/dist/cli.d.ts +3 -3
  36. package/dist/cli.d.ts.map +1 -1
  37. package/dist/cli.js +45 -47
  38. package/dist/cli.js.map +1 -1
  39. package/dist/connection_pool.cjs +242 -0
  40. package/dist/connection_pool.cjs.map +1 -0
  41. package/dist/connection_pool.d.cts +123 -0
  42. package/dist/connection_pool.d.ts +123 -0
  43. package/dist/connection_pool.d.ts.map +1 -0
  44. package/dist/connection_pool.js +218 -0
  45. package/dist/connection_pool.js.map +1 -0
  46. package/dist/connection_pool.test.cjs +256 -0
  47. package/dist/connection_pool.test.cjs.map +1 -0
  48. package/dist/connection_pool.test.js +255 -0
  49. package/dist/connection_pool.test.js.map +1 -0
  50. package/dist/constants.cjs +30 -0
  51. package/dist/constants.cjs.map +1 -1
  52. package/dist/constants.d.cts +10 -0
  53. package/dist/constants.d.ts +10 -0
  54. package/dist/constants.d.ts.map +1 -1
  55. package/dist/constants.js +20 -0
  56. package/dist/constants.js.map +1 -1
  57. package/dist/cpu.cjs +189 -0
  58. package/dist/cpu.cjs.map +1 -0
  59. package/dist/cpu.d.cts +24 -0
  60. package/dist/cpu.d.ts +24 -0
  61. package/dist/cpu.d.ts.map +1 -0
  62. package/dist/cpu.js +152 -0
  63. package/dist/cpu.js.map +1 -0
  64. package/dist/cpu.test.cjs +227 -0
  65. package/dist/cpu.test.cjs.map +1 -0
  66. package/dist/cpu.test.js +204 -0
  67. package/dist/cpu.test.js.map +1 -0
  68. package/dist/http_server.cjs +9 -6
  69. package/dist/http_server.cjs.map +1 -1
  70. package/dist/http_server.d.cts +5 -1
  71. package/dist/http_server.d.ts +5 -1
  72. package/dist/http_server.d.ts.map +1 -1
  73. package/dist/http_server.js +9 -6
  74. package/dist/http_server.js.map +1 -1
  75. package/dist/index.cjs +24 -9
  76. package/dist/index.cjs.map +1 -1
  77. package/dist/index.d.cts +15 -11
  78. package/dist/index.d.ts +15 -11
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +18 -9
  81. package/dist/index.js.map +1 -1
  82. package/dist/inference/api_protos.cjs +70 -2
  83. package/dist/inference/api_protos.cjs.map +1 -1
  84. package/dist/inference/api_protos.d.cts +373 -32
  85. package/dist/inference/api_protos.d.ts +373 -32
  86. package/dist/inference/api_protos.d.ts.map +1 -1
  87. package/dist/inference/api_protos.js +62 -2
  88. package/dist/inference/api_protos.js.map +1 -1
  89. package/dist/inference/index.cjs +8 -0
  90. package/dist/inference/index.cjs.map +1 -1
  91. package/dist/inference/index.d.cts +3 -4
  92. package/dist/inference/index.d.ts +3 -4
  93. package/dist/inference/index.d.ts.map +1 -1
  94. package/dist/inference/index.js +18 -3
  95. package/dist/inference/index.js.map +1 -1
  96. package/dist/inference/interruption/defaults.cjs +81 -0
  97. package/dist/inference/interruption/defaults.cjs.map +1 -0
  98. package/dist/inference/interruption/defaults.d.cts +19 -0
  99. package/dist/inference/interruption/defaults.d.ts +19 -0
  100. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  101. package/dist/inference/interruption/defaults.js +46 -0
  102. package/dist/inference/interruption/defaults.js.map +1 -0
  103. package/dist/inference/interruption/errors.cjs +44 -0
  104. package/dist/inference/interruption/errors.cjs.map +1 -0
  105. package/dist/inference/interruption/errors.d.cts +12 -0
  106. package/dist/inference/interruption/errors.d.ts +12 -0
  107. package/dist/inference/interruption/errors.d.ts.map +1 -0
  108. package/dist/inference/interruption/errors.js +20 -0
  109. package/dist/inference/interruption/errors.js.map +1 -0
  110. package/dist/inference/interruption/http_transport.cjs +163 -0
  111. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  112. package/dist/inference/interruption/http_transport.d.cts +65 -0
  113. package/dist/inference/interruption/http_transport.d.ts +65 -0
  114. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  115. package/dist/inference/interruption/http_transport.js +137 -0
  116. package/dist/inference/interruption/http_transport.js.map +1 -0
  117. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  118. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  119. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  120. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  121. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  122. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  123. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  124. package/dist/inference/interruption/interruption_detector.cjs +198 -0
  125. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  126. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  127. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  128. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  129. package/dist/inference/interruption/interruption_detector.js +164 -0
  130. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  131. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  132. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  133. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  134. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  135. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  136. package/dist/inference/interruption/interruption_stream.js +344 -0
  137. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  138. package/dist/inference/interruption/types.cjs +17 -0
  139. package/dist/inference/interruption/types.cjs.map +1 -0
  140. package/dist/inference/interruption/types.d.cts +66 -0
  141. package/dist/inference/interruption/types.d.ts +66 -0
  142. package/dist/inference/interruption/types.d.ts.map +1 -0
  143. package/dist/inference/interruption/types.js +1 -0
  144. package/dist/inference/interruption/types.js.map +1 -0
  145. package/dist/inference/interruption/utils.cjs +130 -0
  146. package/dist/inference/interruption/utils.cjs.map +1 -0
  147. package/dist/inference/interruption/utils.d.cts +41 -0
  148. package/dist/inference/interruption/utils.d.ts +41 -0
  149. package/dist/inference/interruption/utils.d.ts.map +1 -0
  150. package/dist/inference/interruption/utils.js +105 -0
  151. package/dist/inference/interruption/utils.js.map +1 -0
  152. package/dist/inference/interruption/utils.test.cjs +105 -0
  153. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  154. package/dist/inference/interruption/utils.test.js +104 -0
  155. package/dist/inference/interruption/utils.test.js.map +1 -0
  156. package/dist/inference/interruption/ws_transport.cjs +347 -0
  157. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  158. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  159. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  160. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  161. package/dist/inference/interruption/ws_transport.js +313 -0
  162. package/dist/inference/interruption/ws_transport.js.map +1 -0
  163. package/dist/inference/llm.cjs +106 -66
  164. package/dist/inference/llm.cjs.map +1 -1
  165. package/dist/inference/llm.d.cts +65 -43
  166. package/dist/inference/llm.d.ts +65 -43
  167. package/dist/inference/llm.d.ts.map +1 -1
  168. package/dist/inference/llm.js +100 -66
  169. package/dist/inference/llm.js.map +1 -1
  170. package/dist/inference/stt.cjs +319 -170
  171. package/dist/inference/stt.cjs.map +1 -1
  172. package/dist/inference/stt.d.cts +64 -15
  173. package/dist/inference/stt.d.ts +64 -15
  174. package/dist/inference/stt.d.ts.map +1 -1
  175. package/dist/inference/stt.js +319 -170
  176. package/dist/inference/stt.js.map +1 -1
  177. package/dist/inference/stt.test.cjs +218 -0
  178. package/dist/inference/stt.test.cjs.map +1 -0
  179. package/dist/inference/stt.test.js +217 -0
  180. package/dist/inference/stt.test.js.map +1 -0
  181. package/dist/inference/tts.cjs +249 -71
  182. package/dist/inference/tts.cjs.map +1 -1
  183. package/dist/inference/tts.d.cts +94 -17
  184. package/dist/inference/tts.d.ts +94 -17
  185. package/dist/inference/tts.d.ts.map +1 -1
  186. package/dist/inference/tts.js +249 -77
  187. package/dist/inference/tts.js.map +1 -1
  188. package/dist/inference/tts.test.cjs +305 -0
  189. package/dist/inference/tts.test.cjs.map +1 -0
  190. package/dist/inference/tts.test.js +304 -0
  191. package/dist/inference/tts.test.js.map +1 -0
  192. package/dist/inference/utils.cjs +26 -7
  193. package/dist/inference/utils.cjs.map +1 -1
  194. package/dist/inference/utils.d.cts +14 -1
  195. package/dist/inference/utils.d.ts +14 -1
  196. package/dist/inference/utils.d.ts.map +1 -1
  197. package/dist/inference/utils.js +18 -2
  198. package/dist/inference/utils.js.map +1 -1
  199. package/dist/ipc/inference_proc_executor.cjs +6 -3
  200. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  201. package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
  202. package/dist/ipc/inference_proc_executor.js +6 -3
  203. package/dist/ipc/inference_proc_executor.js.map +1 -1
  204. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  205. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  206. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  207. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  208. package/dist/ipc/job_proc_executor.cjs +6 -1
  209. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  210. package/dist/ipc/job_proc_executor.d.ts.map +1 -1
  211. package/dist/ipc/job_proc_executor.js +6 -1
  212. package/dist/ipc/job_proc_executor.js.map +1 -1
  213. package/dist/ipc/job_proc_lazy_main.cjs +89 -17
  214. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  215. package/dist/ipc/job_proc_lazy_main.js +68 -18
  216. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  217. package/dist/ipc/supervised_proc.cjs +34 -8
  218. package/dist/ipc/supervised_proc.cjs.map +1 -1
  219. package/dist/ipc/supervised_proc.d.cts +8 -0
  220. package/dist/ipc/supervised_proc.d.ts +8 -0
  221. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  222. package/dist/ipc/supervised_proc.js +34 -8
  223. package/dist/ipc/supervised_proc.js.map +1 -1
  224. package/dist/ipc/supervised_proc.test.cjs +145 -0
  225. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  226. package/dist/ipc/supervised_proc.test.js +122 -0
  227. package/dist/ipc/supervised_proc.test.js.map +1 -0
  228. package/dist/job.cjs +109 -1
  229. package/dist/job.cjs.map +1 -1
  230. package/dist/job.d.cts +14 -0
  231. package/dist/job.d.ts +14 -0
  232. package/dist/job.d.ts.map +1 -1
  233. package/dist/job.js +99 -1
  234. package/dist/job.js.map +1 -1
  235. package/dist/language.cjs +394 -0
  236. package/dist/language.cjs.map +1 -0
  237. package/dist/language.d.cts +15 -0
  238. package/dist/language.d.ts +15 -0
  239. package/dist/language.d.ts.map +1 -0
  240. package/dist/language.js +363 -0
  241. package/dist/language.js.map +1 -0
  242. package/dist/language.test.cjs +43 -0
  243. package/dist/language.test.cjs.map +1 -0
  244. package/dist/language.test.js +49 -0
  245. package/dist/language.test.js.map +1 -0
  246. package/dist/llm/chat_context.cjs +345 -3
  247. package/dist/llm/chat_context.cjs.map +1 -1
  248. package/dist/llm/chat_context.d.cts +86 -2
  249. package/dist/llm/chat_context.d.ts +86 -2
  250. package/dist/llm/chat_context.d.ts.map +1 -1
  251. package/dist/llm/chat_context.js +344 -3
  252. package/dist/llm/chat_context.js.map +1 -1
  253. package/dist/llm/chat_context.test.cjs +692 -0
  254. package/dist/llm/chat_context.test.cjs.map +1 -1
  255. package/dist/llm/chat_context.test.js +692 -0
  256. package/dist/llm/chat_context.test.js.map +1 -1
  257. package/dist/llm/fallback_adapter.cjs +280 -0
  258. package/dist/llm/fallback_adapter.cjs.map +1 -0
  259. package/dist/llm/fallback_adapter.d.cts +73 -0
  260. package/dist/llm/fallback_adapter.d.ts +73 -0
  261. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  262. package/dist/llm/fallback_adapter.js +256 -0
  263. package/dist/llm/fallback_adapter.js.map +1 -0
  264. package/dist/llm/fallback_adapter.test.cjs +176 -0
  265. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  266. package/dist/llm/fallback_adapter.test.js +175 -0
  267. package/dist/llm/fallback_adapter.test.js.map +1 -0
  268. package/dist/llm/index.cjs +11 -0
  269. package/dist/llm/index.cjs.map +1 -1
  270. package/dist/llm/index.d.cts +4 -3
  271. package/dist/llm/index.d.ts +4 -3
  272. package/dist/llm/index.d.ts.map +1 -1
  273. package/dist/llm/index.js +13 -1
  274. package/dist/llm/index.js.map +1 -1
  275. package/dist/llm/llm.cjs +65 -11
  276. package/dist/llm/llm.cjs.map +1 -1
  277. package/dist/llm/llm.d.cts +13 -2
  278. package/dist/llm/llm.d.ts +13 -2
  279. package/dist/llm/llm.d.ts.map +1 -1
  280. package/dist/llm/llm.js +65 -11
  281. package/dist/llm/llm.js.map +1 -1
  282. package/dist/llm/provider_format/google.cjs +6 -2
  283. package/dist/llm/provider_format/google.cjs.map +1 -1
  284. package/dist/llm/provider_format/google.d.cts +1 -1
  285. package/dist/llm/provider_format/google.d.ts +1 -1
  286. package/dist/llm/provider_format/google.d.ts.map +1 -1
  287. package/dist/llm/provider_format/google.js +6 -2
  288. package/dist/llm/provider_format/google.js.map +1 -1
  289. package/dist/llm/provider_format/google.test.cjs +48 -0
  290. package/dist/llm/provider_format/google.test.cjs.map +1 -1
  291. package/dist/llm/provider_format/google.test.js +54 -1
  292. package/dist/llm/provider_format/google.test.js.map +1 -1
  293. package/dist/llm/provider_format/index.cjs +2 -0
  294. package/dist/llm/provider_format/index.cjs.map +1 -1
  295. package/dist/llm/provider_format/index.d.cts +2 -2
  296. package/dist/llm/provider_format/index.d.ts +2 -2
  297. package/dist/llm/provider_format/index.d.ts.map +1 -1
  298. package/dist/llm/provider_format/index.js +6 -1
  299. package/dist/llm/provider_format/index.js.map +1 -1
  300. package/dist/llm/provider_format/openai.cjs +126 -24
  301. package/dist/llm/provider_format/openai.cjs.map +1 -1
  302. package/dist/llm/provider_format/openai.d.cts +1 -0
  303. package/dist/llm/provider_format/openai.d.ts +1 -0
  304. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  305. package/dist/llm/provider_format/openai.js +124 -23
  306. package/dist/llm/provider_format/openai.js.map +1 -1
  307. package/dist/llm/provider_format/openai.test.cjs +393 -0
  308. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  309. package/dist/llm/provider_format/openai.test.js +400 -2
  310. package/dist/llm/provider_format/openai.test.js.map +1 -1
  311. package/dist/llm/provider_format/utils.cjs +5 -4
  312. package/dist/llm/provider_format/utils.cjs.map +1 -1
  313. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  314. package/dist/llm/provider_format/utils.js +5 -4
  315. package/dist/llm/provider_format/utils.js.map +1 -1
  316. package/dist/llm/realtime.cjs +3 -0
  317. package/dist/llm/realtime.cjs.map +1 -1
  318. package/dist/llm/realtime.d.cts +15 -1
  319. package/dist/llm/realtime.d.ts +15 -1
  320. package/dist/llm/realtime.d.ts.map +1 -1
  321. package/dist/llm/realtime.js +3 -0
  322. package/dist/llm/realtime.js.map +1 -1
  323. package/dist/llm/remote_chat_context.cjs.map +1 -1
  324. package/dist/llm/remote_chat_context.d.cts +2 -0
  325. package/dist/llm/remote_chat_context.d.ts +2 -0
  326. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  327. package/dist/llm/remote_chat_context.js.map +1 -1
  328. package/dist/llm/tool_context.cjs +50 -2
  329. package/dist/llm/tool_context.cjs.map +1 -1
  330. package/dist/llm/tool_context.d.cts +47 -11
  331. package/dist/llm/tool_context.d.ts +47 -11
  332. package/dist/llm/tool_context.d.ts.map +1 -1
  333. package/dist/llm/tool_context.js +48 -3
  334. package/dist/llm/tool_context.js.map +1 -1
  335. package/dist/llm/tool_context.test.cjs +197 -0
  336. package/dist/llm/tool_context.test.cjs.map +1 -1
  337. package/dist/llm/tool_context.test.js +175 -0
  338. package/dist/llm/tool_context.test.js.map +1 -1
  339. package/dist/llm/utils.cjs +107 -12
  340. package/dist/llm/utils.cjs.map +1 -1
  341. package/dist/llm/utils.d.cts +10 -3
  342. package/dist/llm/utils.d.ts +10 -3
  343. package/dist/llm/utils.d.ts.map +1 -1
  344. package/dist/llm/utils.js +106 -12
  345. package/dist/llm/utils.js.map +1 -1
  346. package/dist/llm/utils.test.cjs +90 -0
  347. package/dist/llm/utils.test.cjs.map +1 -1
  348. package/dist/llm/utils.test.js +98 -2
  349. package/dist/llm/utils.test.js.map +1 -1
  350. package/dist/llm/zod-utils.cjs +102 -0
  351. package/dist/llm/zod-utils.cjs.map +1 -0
  352. package/dist/llm/zod-utils.d.cts +65 -0
  353. package/dist/llm/zod-utils.d.ts +65 -0
  354. package/dist/llm/zod-utils.d.ts.map +1 -0
  355. package/dist/llm/zod-utils.js +64 -0
  356. package/dist/llm/zod-utils.js.map +1 -0
  357. package/dist/llm/zod-utils.test.cjs +472 -0
  358. package/dist/llm/zod-utils.test.cjs.map +1 -0
  359. package/dist/llm/zod-utils.test.js +455 -0
  360. package/dist/llm/zod-utils.test.js.map +1 -0
  361. package/dist/log.cjs +45 -14
  362. package/dist/log.cjs.map +1 -1
  363. package/dist/log.d.cts +8 -1
  364. package/dist/log.d.ts +8 -1
  365. package/dist/log.d.ts.map +1 -1
  366. package/dist/log.js +45 -15
  367. package/dist/log.js.map +1 -1
  368. package/dist/metrics/base.cjs.map +1 -1
  369. package/dist/metrics/base.d.cts +75 -19
  370. package/dist/metrics/base.d.ts +75 -19
  371. package/dist/metrics/base.d.ts.map +1 -1
  372. package/dist/metrics/index.cjs +5 -0
  373. package/dist/metrics/index.cjs.map +1 -1
  374. package/dist/metrics/index.d.cts +2 -1
  375. package/dist/metrics/index.d.ts +2 -1
  376. package/dist/metrics/index.d.ts.map +1 -1
  377. package/dist/metrics/index.js +6 -0
  378. package/dist/metrics/index.js.map +1 -1
  379. package/dist/metrics/model_usage.cjs +189 -0
  380. package/dist/metrics/model_usage.cjs.map +1 -0
  381. package/dist/metrics/model_usage.d.cts +92 -0
  382. package/dist/metrics/model_usage.d.ts +92 -0
  383. package/dist/metrics/model_usage.d.ts.map +1 -0
  384. package/dist/metrics/model_usage.js +164 -0
  385. package/dist/metrics/model_usage.js.map +1 -0
  386. package/dist/metrics/model_usage.test.cjs +474 -0
  387. package/dist/metrics/model_usage.test.cjs.map +1 -0
  388. package/dist/metrics/model_usage.test.js +476 -0
  389. package/dist/metrics/model_usage.test.js.map +1 -0
  390. package/dist/metrics/usage_collector.cjs +5 -2
  391. package/dist/metrics/usage_collector.cjs.map +1 -1
  392. package/dist/metrics/usage_collector.d.cts +10 -1
  393. package/dist/metrics/usage_collector.d.ts +10 -1
  394. package/dist/metrics/usage_collector.d.ts.map +1 -1
  395. package/dist/metrics/usage_collector.js +5 -2
  396. package/dist/metrics/usage_collector.js.map +1 -1
  397. package/dist/metrics/utils.cjs +23 -7
  398. package/dist/metrics/utils.cjs.map +1 -1
  399. package/dist/metrics/utils.d.ts.map +1 -1
  400. package/dist/metrics/utils.js +23 -7
  401. package/dist/metrics/utils.js.map +1 -1
  402. package/dist/stream/deferred_stream.cjs +31 -10
  403. package/dist/stream/deferred_stream.cjs.map +1 -1
  404. package/dist/stream/deferred_stream.d.cts +6 -1
  405. package/dist/stream/deferred_stream.d.ts +6 -1
  406. package/dist/stream/deferred_stream.d.ts.map +1 -1
  407. package/dist/stream/deferred_stream.js +31 -10
  408. package/dist/stream/deferred_stream.js.map +1 -1
  409. package/dist/stream/deferred_stream.test.cjs +2 -2
  410. package/dist/stream/deferred_stream.test.cjs.map +1 -1
  411. package/dist/stream/deferred_stream.test.js +2 -2
  412. package/dist/stream/deferred_stream.test.js.map +1 -1
  413. package/dist/stream/index.cjs +3 -0
  414. package/dist/stream/index.cjs.map +1 -1
  415. package/dist/stream/index.d.cts +1 -0
  416. package/dist/stream/index.d.ts +1 -0
  417. package/dist/stream/index.d.ts.map +1 -1
  418. package/dist/stream/index.js +2 -0
  419. package/dist/stream/index.js.map +1 -1
  420. package/dist/stream/multi_input_stream.cjs +139 -0
  421. package/dist/stream/multi_input_stream.cjs.map +1 -0
  422. package/dist/stream/multi_input_stream.d.cts +55 -0
  423. package/dist/stream/multi_input_stream.d.ts +55 -0
  424. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  425. package/dist/stream/multi_input_stream.js +115 -0
  426. package/dist/stream/multi_input_stream.js.map +1 -0
  427. package/dist/stream/multi_input_stream.test.cjs +344 -0
  428. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  429. package/dist/stream/multi_input_stream.test.js +343 -0
  430. package/dist/stream/multi_input_stream.test.js.map +1 -0
  431. package/dist/stream/stream_channel.cjs +39 -1
  432. package/dist/stream/stream_channel.cjs.map +1 -1
  433. package/dist/stream/stream_channel.d.cts +5 -2
  434. package/dist/stream/stream_channel.d.ts +5 -2
  435. package/dist/stream/stream_channel.d.ts.map +1 -1
  436. package/dist/stream/stream_channel.js +39 -1
  437. package/dist/stream/stream_channel.js.map +1 -1
  438. package/dist/stream/stream_channel.test.cjs +27 -0
  439. package/dist/stream/stream_channel.test.cjs.map +1 -1
  440. package/dist/stream/stream_channel.test.js +27 -0
  441. package/dist/stream/stream_channel.test.js.map +1 -1
  442. package/dist/stt/stream_adapter.cjs +24 -9
  443. package/dist/stt/stream_adapter.cjs.map +1 -1
  444. package/dist/stt/stream_adapter.d.cts +7 -3
  445. package/dist/stt/stream_adapter.d.ts +7 -3
  446. package/dist/stt/stream_adapter.d.ts.map +1 -1
  447. package/dist/stt/stream_adapter.js +24 -9
  448. package/dist/stt/stream_adapter.js.map +1 -1
  449. package/dist/stt/stt.cjs +94 -19
  450. package/dist/stt/stt.cjs.map +1 -1
  451. package/dist/stt/stt.d.cts +68 -5
  452. package/dist/stt/stt.d.ts +68 -5
  453. package/dist/stt/stt.d.ts.map +1 -1
  454. package/dist/stt/stt.js +96 -21
  455. package/dist/stt/stt.js.map +1 -1
  456. package/dist/telemetry/index.cjs +72 -0
  457. package/dist/telemetry/index.cjs.map +1 -0
  458. package/dist/telemetry/index.d.cts +7 -0
  459. package/dist/telemetry/index.d.ts +7 -0
  460. package/dist/telemetry/index.d.ts.map +1 -0
  461. package/dist/telemetry/index.js +37 -0
  462. package/dist/telemetry/index.js.map +1 -0
  463. package/dist/telemetry/logging.cjs +65 -0
  464. package/dist/telemetry/logging.cjs.map +1 -0
  465. package/dist/telemetry/logging.d.cts +21 -0
  466. package/dist/telemetry/logging.d.ts +21 -0
  467. package/dist/telemetry/logging.d.ts.map +1 -0
  468. package/dist/telemetry/logging.js +40 -0
  469. package/dist/telemetry/logging.js.map +1 -0
  470. package/dist/telemetry/otel_http_exporter.cjs +166 -0
  471. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  472. package/dist/telemetry/otel_http_exporter.d.cts +63 -0
  473. package/dist/telemetry/otel_http_exporter.d.ts +63 -0
  474. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  475. package/dist/telemetry/otel_http_exporter.js +142 -0
  476. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  477. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  478. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  479. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  480. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  481. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  482. package/dist/telemetry/pino_otel_transport.js +189 -0
  483. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  484. package/dist/telemetry/trace_types.cjs +233 -0
  485. package/dist/telemetry/trace_types.cjs.map +1 -0
  486. package/dist/telemetry/trace_types.d.cts +74 -0
  487. package/dist/telemetry/trace_types.d.ts +74 -0
  488. package/dist/telemetry/trace_types.d.ts.map +1 -0
  489. package/dist/telemetry/trace_types.js +141 -0
  490. package/dist/telemetry/trace_types.js.map +1 -0
  491. package/dist/telemetry/traces.cjs +484 -0
  492. package/dist/telemetry/traces.cjs.map +1 -0
  493. package/dist/telemetry/traces.d.cts +116 -0
  494. package/dist/telemetry/traces.d.ts +116 -0
  495. package/dist/telemetry/traces.d.ts.map +1 -0
  496. package/dist/telemetry/traces.js +449 -0
  497. package/dist/telemetry/traces.js.map +1 -0
  498. package/dist/telemetry/utils.cjs +86 -0
  499. package/dist/telemetry/utils.cjs.map +1 -0
  500. package/dist/telemetry/utils.d.cts +5 -0
  501. package/dist/telemetry/utils.d.ts +5 -0
  502. package/dist/telemetry/utils.d.ts.map +1 -0
  503. package/dist/telemetry/utils.js +51 -0
  504. package/dist/telemetry/utils.js.map +1 -0
  505. package/dist/tokenize/basic/sentence.cjs +3 -3
  506. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  507. package/dist/tokenize/basic/sentence.js +3 -3
  508. package/dist/tokenize/basic/sentence.js.map +1 -1
  509. package/dist/tokenize/tokenizer.test.cjs +3 -1
  510. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  511. package/dist/tokenize/tokenizer.test.js +3 -1
  512. package/dist/tokenize/tokenizer.test.js.map +1 -1
  513. package/dist/transcription.cjs.map +1 -1
  514. package/dist/transcription.d.cts +6 -0
  515. package/dist/transcription.d.ts +6 -0
  516. package/dist/transcription.d.ts.map +1 -1
  517. package/dist/transcription.js.map +1 -1
  518. package/dist/tts/fallback_adapter.cjs +472 -0
  519. package/dist/tts/fallback_adapter.cjs.map +1 -0
  520. package/dist/tts/fallback_adapter.d.cts +110 -0
  521. package/dist/tts/fallback_adapter.d.ts +110 -0
  522. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  523. package/dist/tts/fallback_adapter.js +448 -0
  524. package/dist/tts/fallback_adapter.js.map +1 -0
  525. package/dist/tts/index.cjs +3 -0
  526. package/dist/tts/index.cjs.map +1 -1
  527. package/dist/tts/index.d.cts +1 -0
  528. package/dist/tts/index.d.ts +1 -0
  529. package/dist/tts/index.d.ts.map +1 -1
  530. package/dist/tts/index.js +2 -0
  531. package/dist/tts/index.js.map +1 -1
  532. package/dist/tts/stream_adapter.cjs +25 -8
  533. package/dist/tts/stream_adapter.cjs.map +1 -1
  534. package/dist/tts/stream_adapter.d.cts +6 -3
  535. package/dist/tts/stream_adapter.d.ts +6 -3
  536. package/dist/tts/stream_adapter.d.ts.map +1 -1
  537. package/dist/tts/stream_adapter.js +25 -8
  538. package/dist/tts/stream_adapter.js.map +1 -1
  539. package/dist/tts/tts.cjs +189 -57
  540. package/dist/tts/tts.cjs.map +1 -1
  541. package/dist/tts/tts.d.cts +58 -6
  542. package/dist/tts/tts.d.ts +58 -6
  543. package/dist/tts/tts.d.ts.map +1 -1
  544. package/dist/tts/tts.js +191 -59
  545. package/dist/tts/tts.js.map +1 -1
  546. package/dist/typed_promise.cjs +48 -0
  547. package/dist/typed_promise.cjs.map +1 -0
  548. package/dist/typed_promise.d.cts +24 -0
  549. package/dist/typed_promise.d.ts +24 -0
  550. package/dist/typed_promise.d.ts.map +1 -0
  551. package/dist/typed_promise.js +28 -0
  552. package/dist/typed_promise.js.map +1 -0
  553. package/dist/types.cjs +24 -32
  554. package/dist/types.cjs.map +1 -1
  555. package/dist/types.d.cts +45 -10
  556. package/dist/types.d.ts +45 -10
  557. package/dist/types.d.ts.map +1 -1
  558. package/dist/types.js +20 -30
  559. package/dist/types.js.map +1 -1
  560. package/dist/utils.cjs +124 -28
  561. package/dist/utils.cjs.map +1 -1
  562. package/dist/utils.d.cts +41 -1
  563. package/dist/utils.d.ts +41 -1
  564. package/dist/utils.d.ts.map +1 -1
  565. package/dist/utils.js +119 -27
  566. package/dist/utils.js.map +1 -1
  567. package/dist/utils.test.cjs +73 -1
  568. package/dist/utils.test.cjs.map +1 -1
  569. package/dist/utils.test.js +74 -10
  570. package/dist/utils.test.js.map +1 -1
  571. package/dist/vad.cjs +35 -15
  572. package/dist/vad.cjs.map +1 -1
  573. package/dist/vad.d.cts +15 -5
  574. package/dist/vad.d.ts +15 -5
  575. package/dist/vad.d.ts.map +1 -1
  576. package/dist/vad.js +35 -15
  577. package/dist/vad.js.map +1 -1
  578. package/dist/version.cjs +1 -1
  579. package/dist/version.cjs.map +1 -1
  580. package/dist/version.d.cts +1 -1
  581. package/dist/version.d.ts +1 -1
  582. package/dist/version.d.ts.map +1 -1
  583. package/dist/version.js +1 -1
  584. package/dist/version.js.map +1 -1
  585. package/dist/voice/agent.cjs +258 -35
  586. package/dist/voice/agent.cjs.map +1 -1
  587. package/dist/voice/agent.d.cts +54 -13
  588. package/dist/voice/agent.d.ts +54 -13
  589. package/dist/voice/agent.d.ts.map +1 -1
  590. package/dist/voice/agent.js +254 -34
  591. package/dist/voice/agent.js.map +1 -1
  592. package/dist/voice/agent.test.cjs +314 -0
  593. package/dist/voice/agent.test.cjs.map +1 -1
  594. package/dist/voice/agent.test.js +316 -2
  595. package/dist/voice/agent.test.js.map +1 -1
  596. package/dist/voice/agent_activity.cjs +1116 -385
  597. package/dist/voice/agent_activity.cjs.map +1 -1
  598. package/dist/voice/agent_activity.d.cts +72 -11
  599. package/dist/voice/agent_activity.d.ts +72 -11
  600. package/dist/voice/agent_activity.d.ts.map +1 -1
  601. package/dist/voice/agent_activity.js +1119 -383
  602. package/dist/voice/agent_activity.js.map +1 -1
  603. package/dist/voice/agent_activity.test.cjs +135 -0
  604. package/dist/voice/agent_activity.test.cjs.map +1 -0
  605. package/dist/voice/agent_activity.test.js +134 -0
  606. package/dist/voice/agent_activity.test.js.map +1 -0
  607. package/dist/voice/agent_session.cjs +550 -90
  608. package/dist/voice/agent_session.cjs.map +1 -1
  609. package/dist/voice/agent_session.d.cts +185 -25
  610. package/dist/voice/agent_session.d.ts +185 -25
  611. package/dist/voice/agent_session.d.ts.map +1 -1
  612. package/dist/voice/agent_session.js +556 -91
  613. package/dist/voice/agent_session.js.map +1 -1
  614. package/dist/voice/audio_recognition.cjs +605 -46
  615. package/dist/voice/audio_recognition.cjs.map +1 -1
  616. package/dist/voice/audio_recognition.d.cts +96 -4
  617. package/dist/voice/audio_recognition.d.ts +96 -4
  618. package/dist/voice/audio_recognition.d.ts.map +1 -1
  619. package/dist/voice/audio_recognition.js +611 -47
  620. package/dist/voice/audio_recognition.js.map +1 -1
  621. package/dist/voice/audio_recognition_span.test.cjs +295 -0
  622. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  623. package/dist/voice/audio_recognition_span.test.js +299 -0
  624. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  625. package/dist/voice/avatar/datastream_io.cjs +7 -1
  626. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  627. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  628. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  629. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  630. package/dist/voice/avatar/datastream_io.js +7 -1
  631. package/dist/voice/avatar/datastream_io.js.map +1 -1
  632. package/dist/voice/background_audio.cjs +367 -0
  633. package/dist/voice/background_audio.cjs.map +1 -0
  634. package/dist/voice/background_audio.d.cts +123 -0
  635. package/dist/voice/background_audio.d.ts +123 -0
  636. package/dist/voice/background_audio.d.ts.map +1 -0
  637. package/dist/voice/background_audio.js +343 -0
  638. package/dist/voice/background_audio.js.map +1 -0
  639. package/dist/voice/events.cjs +3 -0
  640. package/dist/voice/events.cjs.map +1 -1
  641. package/dist/voice/events.d.cts +16 -9
  642. package/dist/voice/events.d.ts +16 -9
  643. package/dist/voice/events.d.ts.map +1 -1
  644. package/dist/voice/events.js +3 -0
  645. package/dist/voice/events.js.map +1 -1
  646. package/dist/voice/generation.cjs +205 -41
  647. package/dist/voice/generation.cjs.map +1 -1
  648. package/dist/voice/generation.d.cts +21 -5
  649. package/dist/voice/generation.d.ts +21 -5
  650. package/dist/voice/generation.d.ts.map +1 -1
  651. package/dist/voice/generation.js +215 -43
  652. package/dist/voice/generation.js.map +1 -1
  653. package/dist/voice/generation_tools.test.cjs +236 -0
  654. package/dist/voice/generation_tools.test.cjs.map +1 -0
  655. package/dist/voice/generation_tools.test.js +235 -0
  656. package/dist/voice/generation_tools.test.js.map +1 -0
  657. package/dist/voice/index.cjs +33 -2
  658. package/dist/voice/index.cjs.map +1 -1
  659. package/dist/voice/index.d.cts +8 -2
  660. package/dist/voice/index.d.ts +8 -2
  661. package/dist/voice/index.d.ts.map +1 -1
  662. package/dist/voice/index.js +19 -2
  663. package/dist/voice/index.js.map +1 -1
  664. package/dist/voice/interruption_detection.test.cjs +114 -0
  665. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  666. package/dist/voice/interruption_detection.test.js +113 -0
  667. package/dist/voice/interruption_detection.test.js.map +1 -0
  668. package/dist/voice/io.cjs +66 -6
  669. package/dist/voice/io.cjs.map +1 -1
  670. package/dist/voice/io.d.cts +67 -7
  671. package/dist/voice/io.d.ts +67 -7
  672. package/dist/voice/io.d.ts.map +1 -1
  673. package/dist/voice/io.js +62 -5
  674. package/dist/voice/io.js.map +1 -1
  675. package/dist/voice/recorder_io/index.cjs +23 -0
  676. package/dist/voice/recorder_io/index.cjs.map +1 -0
  677. package/dist/voice/recorder_io/index.d.cts +2 -0
  678. package/dist/voice/recorder_io/index.d.ts +2 -0
  679. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  680. package/dist/voice/recorder_io/index.js +2 -0
  681. package/dist/voice/recorder_io/index.js.map +1 -0
  682. package/dist/voice/recorder_io/recorder_io.cjs +607 -0
  683. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  684. package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
  685. package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
  686. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  687. package/dist/voice/recorder_io/recorder_io.js +573 -0
  688. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  689. package/dist/voice/remote_session.cjs +922 -0
  690. package/dist/voice/remote_session.cjs.map +1 -0
  691. package/dist/voice/remote_session.d.cts +108 -0
  692. package/dist/voice/remote_session.d.ts +108 -0
  693. package/dist/voice/remote_session.d.ts.map +1 -0
  694. package/dist/voice/remote_session.js +887 -0
  695. package/dist/voice/remote_session.js.map +1 -0
  696. package/dist/voice/report.cjs +88 -0
  697. package/dist/voice/report.cjs.map +1 -0
  698. package/dist/voice/report.d.cts +49 -0
  699. package/dist/voice/report.d.ts +49 -0
  700. package/dist/voice/report.d.ts.map +1 -0
  701. package/dist/voice/report.js +63 -0
  702. package/dist/voice/report.js.map +1 -0
  703. package/dist/voice/report.test.cjs +121 -0
  704. package/dist/voice/report.test.cjs.map +1 -0
  705. package/dist/voice/report.test.js +120 -0
  706. package/dist/voice/report.test.js.map +1 -0
  707. package/dist/voice/room_io/_input.cjs +40 -7
  708. package/dist/voice/room_io/_input.cjs.map +1 -1
  709. package/dist/voice/room_io/_input.d.cts +5 -2
  710. package/dist/voice/room_io/_input.d.ts +5 -2
  711. package/dist/voice/room_io/_input.d.ts.map +1 -1
  712. package/dist/voice/room_io/_input.js +41 -8
  713. package/dist/voice/room_io/_input.js.map +1 -1
  714. package/dist/voice/room_io/_output.cjs +19 -11
  715. package/dist/voice/room_io/_output.cjs.map +1 -1
  716. package/dist/voice/room_io/_output.d.cts +7 -4
  717. package/dist/voice/room_io/_output.d.ts +7 -4
  718. package/dist/voice/room_io/_output.d.ts.map +1 -1
  719. package/dist/voice/room_io/_output.js +20 -12
  720. package/dist/voice/room_io/_output.js.map +1 -1
  721. package/dist/voice/room_io/room_io.cjs +33 -6
  722. package/dist/voice/room_io/room_io.cjs.map +1 -1
  723. package/dist/voice/room_io/room_io.d.cts +29 -9
  724. package/dist/voice/room_io/room_io.d.ts +29 -9
  725. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  726. package/dist/voice/room_io/room_io.js +33 -7
  727. package/dist/voice/room_io/room_io.js.map +1 -1
  728. package/dist/voice/speech_handle.cjs +22 -4
  729. package/dist/voice/speech_handle.cjs.map +1 -1
  730. package/dist/voice/speech_handle.d.cts +17 -2
  731. package/dist/voice/speech_handle.d.ts +17 -2
  732. package/dist/voice/speech_handle.d.ts.map +1 -1
  733. package/dist/voice/speech_handle.js +21 -4
  734. package/dist/voice/speech_handle.js.map +1 -1
  735. package/dist/voice/testing/fake_llm.cjs +127 -0
  736. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  737. package/dist/voice/testing/fake_llm.d.cts +30 -0
  738. package/dist/voice/testing/fake_llm.d.ts +30 -0
  739. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  740. package/dist/voice/testing/fake_llm.js +103 -0
  741. package/dist/voice/testing/fake_llm.js.map +1 -0
  742. package/dist/voice/testing/index.cjs +57 -0
  743. package/dist/voice/testing/index.cjs.map +1 -0
  744. package/dist/voice/testing/index.d.cts +21 -0
  745. package/dist/voice/testing/index.d.ts +21 -0
  746. package/dist/voice/testing/index.d.ts.map +1 -0
  747. package/dist/voice/testing/index.js +35 -0
  748. package/dist/voice/testing/index.js.map +1 -0
  749. package/dist/voice/testing/run_result.cjs +817 -0
  750. package/dist/voice/testing/run_result.cjs.map +1 -0
  751. package/dist/voice/testing/run_result.d.cts +385 -0
  752. package/dist/voice/testing/run_result.d.ts +385 -0
  753. package/dist/voice/testing/run_result.d.ts.map +1 -0
  754. package/dist/voice/testing/run_result.js +790 -0
  755. package/dist/voice/testing/run_result.js.map +1 -0
  756. package/dist/voice/testing/types.cjs +46 -0
  757. package/dist/voice/testing/types.cjs.map +1 -0
  758. package/dist/voice/testing/types.d.cts +83 -0
  759. package/dist/voice/testing/types.d.ts +83 -0
  760. package/dist/voice/testing/types.d.ts.map +1 -0
  761. package/dist/voice/testing/types.js +19 -0
  762. package/dist/voice/testing/types.js.map +1 -0
  763. package/dist/voice/transcription/synchronizer.cjs +139 -15
  764. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  765. package/dist/voice/transcription/synchronizer.d.cts +35 -4
  766. package/dist/voice/transcription/synchronizer.d.ts +35 -4
  767. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  768. package/dist/voice/transcription/synchronizer.js +143 -16
  769. package/dist/voice/transcription/synchronizer.js.map +1 -1
  770. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  771. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  772. package/dist/voice/transcription/synchronizer.test.js +150 -0
  773. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  774. package/dist/voice/turn_config/endpointing.cjs +33 -0
  775. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  776. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  777. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  778. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  779. package/dist/voice/turn_config/endpointing.js +9 -0
  780. package/dist/voice/turn_config/endpointing.js.map +1 -0
  781. package/dist/voice/turn_config/interruption.cjs +37 -0
  782. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  783. package/dist/voice/turn_config/interruption.d.cts +53 -0
  784. package/dist/voice/turn_config/interruption.d.ts +53 -0
  785. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  786. package/dist/voice/turn_config/interruption.js +13 -0
  787. package/dist/voice/turn_config/interruption.js.map +1 -0
  788. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  789. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  790. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  791. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  792. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  793. package/dist/voice/turn_config/turn_handling.js +11 -0
  794. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  795. package/dist/voice/turn_config/utils.cjs +157 -0
  796. package/dist/voice/turn_config/utils.cjs.map +1 -0
  797. package/dist/voice/turn_config/utils.d.cts +37 -0
  798. package/dist/voice/turn_config/utils.d.ts +37 -0
  799. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  800. package/dist/voice/turn_config/utils.js +131 -0
  801. package/dist/voice/turn_config/utils.js.map +1 -0
  802. package/dist/voice/turn_config/utils.test.cjs +128 -0
  803. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  804. package/dist/voice/turn_config/utils.test.js +127 -0
  805. package/dist/voice/turn_config/utils.test.js.map +1 -0
  806. package/dist/voice/utils.cjs +47 -0
  807. package/dist/voice/utils.cjs.map +1 -0
  808. package/dist/voice/utils.d.cts +4 -0
  809. package/dist/voice/utils.d.ts +4 -0
  810. package/dist/voice/utils.d.ts.map +1 -0
  811. package/dist/voice/utils.js +23 -0
  812. package/dist/voice/utils.js.map +1 -0
  813. package/dist/worker.cjs +44 -52
  814. package/dist/worker.cjs.map +1 -1
  815. package/dist/worker.d.cts +18 -8
  816. package/dist/worker.d.ts +18 -8
  817. package/dist/worker.d.ts.map +1 -1
  818. package/dist/worker.js +43 -43
  819. package/dist/worker.js.map +1 -1
  820. package/package.json +35 -13
  821. package/resources/NOTICE +2 -0
  822. package/resources/keyboard-typing.ogg +0 -0
  823. package/resources/keyboard-typing2.ogg +0 -0
  824. package/resources/office-ambience.ogg +0 -0
  825. package/src/_exceptions.ts +5 -0
  826. package/src/audio.ts +132 -1
  827. package/src/beta/index.ts +9 -0
  828. package/src/beta/workflows/index.ts +9 -0
  829. package/src/beta/workflows/task_group.ts +203 -0
  830. package/src/cli.ts +57 -66
  831. package/src/connection_pool.test.ts +346 -0
  832. package/src/connection_pool.ts +307 -0
  833. package/src/constants.ts +14 -0
  834. package/src/cpu.test.ts +239 -0
  835. package/src/cpu.ts +173 -0
  836. package/src/http_server.ts +18 -6
  837. package/src/index.ts +15 -13
  838. package/src/inference/api_protos.ts +85 -2
  839. package/src/inference/index.ts +32 -4
  840. package/src/inference/interruption/defaults.ts +51 -0
  841. package/src/inference/interruption/errors.ts +25 -0
  842. package/src/inference/interruption/http_transport.ts +207 -0
  843. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  844. package/src/inference/interruption/interruption_detector.ts +204 -0
  845. package/src/inference/interruption/interruption_stream.ts +467 -0
  846. package/src/inference/interruption/types.ts +84 -0
  847. package/src/inference/interruption/utils.test.ts +132 -0
  848. package/src/inference/interruption/utils.ts +137 -0
  849. package/src/inference/interruption/ws_transport.ts +416 -0
  850. package/src/inference/llm.ts +214 -163
  851. package/src/inference/stt.test.ts +253 -0
  852. package/src/inference/stt.ts +449 -208
  853. package/src/inference/tts.test.ts +354 -0
  854. package/src/inference/tts.ts +417 -115
  855. package/src/inference/utils.ts +30 -2
  856. package/src/ipc/inference_proc_executor.ts +11 -3
  857. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  858. package/src/ipc/job_proc_executor.ts +11 -1
  859. package/src/ipc/job_proc_lazy_main.ts +86 -20
  860. package/src/ipc/supervised_proc.test.ts +153 -0
  861. package/src/ipc/supervised_proc.ts +39 -10
  862. package/src/job.ts +120 -1
  863. package/src/language.test.ts +62 -0
  864. package/src/language.ts +380 -0
  865. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  866. package/src/llm/chat_context.test.ts +787 -0
  867. package/src/llm/chat_context.ts +493 -2
  868. package/src/llm/fallback_adapter.test.ts +238 -0
  869. package/src/llm/fallback_adapter.ts +394 -0
  870. package/src/llm/index.ts +13 -0
  871. package/src/llm/llm.ts +77 -12
  872. package/src/llm/provider_format/google.test.ts +72 -1
  873. package/src/llm/provider_format/google.ts +10 -6
  874. package/src/llm/provider_format/index.ts +7 -2
  875. package/src/llm/provider_format/openai.test.ts +480 -2
  876. package/src/llm/provider_format/openai.ts +152 -21
  877. package/src/llm/provider_format/utils.ts +11 -5
  878. package/src/llm/realtime.ts +23 -2
  879. package/src/llm/remote_chat_context.ts +2 -2
  880. package/src/llm/tool_context.test.ts +210 -1
  881. package/src/llm/tool_context.ts +115 -17
  882. package/src/llm/utils.test.ts +103 -2
  883. package/src/llm/utils.ts +152 -16
  884. package/src/llm/zod-utils.test.ts +577 -0
  885. package/src/llm/zod-utils.ts +153 -0
  886. package/src/log.ts +71 -19
  887. package/src/metrics/base.ts +78 -19
  888. package/src/metrics/index.ts +12 -0
  889. package/src/metrics/model_usage.test.ts +545 -0
  890. package/src/metrics/model_usage.ts +262 -0
  891. package/src/metrics/usage_collector.ts +14 -3
  892. package/src/metrics/utils.ts +27 -7
  893. package/src/stream/deferred_stream.test.ts +3 -3
  894. package/src/stream/deferred_stream.ts +43 -11
  895. package/src/stream/index.ts +1 -0
  896. package/src/stream/multi_input_stream.test.ts +545 -0
  897. package/src/stream/multi_input_stream.ts +172 -0
  898. package/src/stream/stream_channel.test.ts +37 -0
  899. package/src/stream/stream_channel.ts +43 -3
  900. package/src/stt/stream_adapter.ts +30 -9
  901. package/src/stt/stt.ts +140 -23
  902. package/src/telemetry/index.ts +28 -0
  903. package/src/telemetry/logging.ts +55 -0
  904. package/src/telemetry/otel_http_exporter.ts +218 -0
  905. package/src/telemetry/pino_otel_transport.ts +265 -0
  906. package/src/telemetry/trace_types.ts +109 -0
  907. package/src/telemetry/traces.ts +673 -0
  908. package/src/telemetry/utils.ts +61 -0
  909. package/src/tokenize/basic/sentence.ts +3 -3
  910. package/src/tokenize/tokenizer.test.ts +4 -0
  911. package/src/transcription.ts +6 -0
  912. package/src/tts/fallback_adapter.ts +586 -0
  913. package/src/tts/index.ts +1 -0
  914. package/src/tts/stream_adapter.ts +38 -8
  915. package/src/tts/tts.ts +245 -62
  916. package/src/typed_promise.ts +67 -0
  917. package/src/types.ts +62 -33
  918. package/src/utils.test.ts +90 -10
  919. package/src/utils.ts +178 -33
  920. package/src/vad.ts +42 -18
  921. package/src/version.ts +1 -1
  922. package/src/voice/agent.test.ts +347 -2
  923. package/src/voice/agent.ts +346 -44
  924. package/src/voice/agent_activity.test.ts +194 -0
  925. package/src/voice/agent_activity.ts +1457 -388
  926. package/src/voice/agent_session.ts +817 -112
  927. package/src/voice/audio_recognition.ts +845 -70
  928. package/src/voice/audio_recognition_span.test.ts +341 -0
  929. package/src/voice/avatar/datastream_io.ts +9 -1
  930. package/src/voice/background_audio.ts +494 -0
  931. package/src/voice/events.ts +27 -7
  932. package/src/voice/generation.ts +310 -56
  933. package/src/voice/generation_tools.test.ts +268 -0
  934. package/src/voice/index.ts +17 -3
  935. package/src/voice/interruption_detection.test.ts +151 -0
  936. package/src/voice/io.ts +115 -12
  937. package/src/voice/recorder_io/index.ts +4 -0
  938. package/src/voice/recorder_io/recorder_io.ts +783 -0
  939. package/src/voice/remote_session.ts +1083 -0
  940. package/src/voice/report.test.ts +136 -0
  941. package/src/voice/report.ts +140 -0
  942. package/src/voice/room_io/_input.ts +45 -10
  943. package/src/voice/room_io/_output.ts +26 -14
  944. package/src/voice/room_io/room_io.ts +67 -22
  945. package/src/voice/speech_handle.ts +38 -6
  946. package/src/voice/testing/fake_llm.ts +138 -0
  947. package/src/voice/testing/index.ts +52 -0
  948. package/src/voice/testing/run_result.ts +995 -0
  949. package/src/voice/testing/types.ts +118 -0
  950. package/src/voice/transcription/synchronizer.test.ts +206 -0
  951. package/src/voice/transcription/synchronizer.ts +204 -19
  952. package/src/voice/turn_config/endpointing.ts +33 -0
  953. package/src/voice/turn_config/interruption.ts +56 -0
  954. package/src/voice/turn_config/turn_handling.ts +45 -0
  955. package/src/voice/turn_config/utils.test.ts +148 -0
  956. package/src/voice/turn_config/utils.ts +167 -0
  957. package/src/voice/utils.ts +29 -0
  958. package/src/worker.ts +92 -78
  959. package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
@@ -3,13 +3,19 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { Mutex } from '@livekit/mutex';
5
5
  import type { AudioFrame } from '@livekit/rtc-node';
6
+ import type { Span } from '@opentelemetry/api';
7
+ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
6
8
  import { Heap } from 'heap-js';
7
9
  import { AsyncLocalStorage } from 'node:async_hooks';
8
- import { ReadableStream } from 'node:stream/web';
9
- import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
+ import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
12
+ import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
13
+ import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
14
+ import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
10
15
  import {
11
16
  type ChatItem,
12
17
  type FunctionCall,
18
+ type FunctionCallOutput,
13
19
  type GenerationCreatedEvent,
14
20
  type InputSpeechStartedEvent,
15
21
  type InputSpeechStoppedEvent,
@@ -20,31 +26,41 @@ import {
20
26
  type RealtimeSession,
21
27
  type ToolChoice,
22
28
  type ToolContext,
29
+ ToolFlag,
23
30
  } from '../llm/index.js';
24
31
  import type { LLMError } from '../llm/llm.js';
32
+ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
25
33
  import { log } from '../log.js';
26
34
  import type {
27
35
  EOUMetrics,
36
+ InterruptionMetrics,
28
37
  LLMMetrics,
29
38
  RealtimeModelMetrics,
30
39
  STTMetrics,
31
40
  TTSMetrics,
32
41
  VADMetrics,
33
42
  } from '../metrics/base.js';
34
- import { DeferredReadableStream } from '../stream/deferred_stream.js';
43
+ import { MultiInputStream } from '../stream/multi_input_stream.js';
35
44
  import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
45
+ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
36
46
  import { splitWords } from '../tokenize/basic/word.js';
37
47
  import { TTS, type TTSError } from '../tts/tts.js';
38
- import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
48
+ import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js';
39
49
  import { VAD, type VADEvent } from '../vad.js';
40
50
  import type { Agent, ModelSettings } from './agent.js';
41
- import { StopResponse, asyncLocalStorage } from './agent.js';
51
+ import {
52
+ StopResponse,
53
+ _getActivityTaskInfo,
54
+ _setActivityTaskInfo,
55
+ functionCallStorage,
56
+ speechHandleStorage,
57
+ } from './agent.js';
42
58
  import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
43
59
  import {
44
60
  AudioRecognition,
45
61
  type EndOfTurnInfo,
62
+ type PreemptiveGenerationInfo,
46
63
  type RecognitionHooks,
47
- type _TurnDetector,
48
64
  } from './audio_recognition.js';
49
65
  import {
50
66
  AgentSessionEventTypes,
@@ -54,7 +70,7 @@ import {
54
70
  createSpeechCreatedEvent,
55
71
  createUserInputTranscribedEvent,
56
72
  } from './events.js';
57
- import type { ToolExecutionOutput } from './generation.js';
73
+ import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
58
74
  import {
59
75
  type _AudioOut,
60
76
  type _TextOut,
@@ -66,34 +82,105 @@ import {
66
82
  removeInstructions,
67
83
  updateInstructions,
68
84
  } from './generation.js';
85
+ import type { TimedString } from './io.js';
69
86
  import { SpeechHandle } from './speech_handle.js';
87
+ import { setParticipantSpanAttributes } from './utils.js';
88
+
89
+ export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
90
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
91
+
92
+ interface OnEnterData {
93
+ session: AgentSession;
94
+ agent: Agent;
95
+ }
70
96
 
71
- // equivalent to Python's contextvars
72
- const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
97
+ interface PreemptiveGeneration {
98
+ speechHandle: SpeechHandle;
99
+ userMessage: ChatMessage;
100
+ info: PreemptiveGenerationInfo;
101
+ chatCtx: ChatContext;
102
+ tools: ToolContext;
103
+ toolChoice: ToolChoice | null;
104
+ createdAt: number;
105
+ }
73
106
 
107
+ // TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
74
108
  export class AgentActivity implements RecognitionHooks {
109
+ agent: Agent;
110
+ agentSession: AgentSession;
111
+
75
112
  private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
113
+
76
114
  private started = false;
77
115
  private audioRecognition?: AudioRecognition;
78
116
  private realtimeSession?: RealtimeSession;
79
- private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
117
+ private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
118
+ private turnDetectionMode?: TurnDetectionMode;
80
119
  private logger = log();
81
- private _draining = false;
120
+ private _schedulingPaused = true;
121
+ private _drainBlockedTasks: Task<any>[] = [];
82
122
  private _currentSpeech?: SpeechHandle;
83
123
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
124
  private q_updated: Future;
85
125
  private speechTasks: Set<Task<void>> = new Set();
86
126
  private lock = new Mutex();
87
- private audioStream = new DeferredReadableStream<AudioFrame>();
127
+ private audioStream = new MultiInputStream<AudioFrame>();
128
+ private audioStreamId?: string;
129
+
88
130
  // default to null as None, which maps to the default provider tool choice value
89
131
  private toolChoice: ToolChoice | null = null;
132
+ private _preemptiveGeneration?: PreemptiveGeneration;
133
+ private interruptionDetector?: AdaptiveInterruptionDetector;
134
+ private isInterruptionDetectionEnabled: boolean;
135
+ private isInterruptionByAudioActivityEnabled: boolean;
136
+ private isDefaultInterruptionByAudioActivityEnabled: boolean;
90
137
 
91
- agent: Agent;
92
- agentSession: AgentSession;
138
+ private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
139
+ this.onGenerationCreated(ev);
140
+
141
+ private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
142
+ this.onInputSpeechStarted(ev);
143
+
144
+ private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
145
+ this.onInputSpeechStopped(ev);
146
+
147
+ private readonly onRealtimeInputAudioTranscriptionCompleted = (
148
+ ev: InputTranscriptionCompleted,
149
+ ): void => this.onInputAudioTranscriptionCompleted(ev);
150
+
151
+ private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
152
+ this.onError(ev);
153
+
154
+ private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
155
+ this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
156
+ };
157
+
158
+ private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
+ this.agentSession._usageCollector.collect(ev);
160
+ this.agentSession.emit(
161
+ AgentSessionEventTypes.MetricsCollected,
162
+ createMetricsCollectedEvent({ metrics: ev }),
163
+ );
164
+ };
165
+
166
+ private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
167
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
168
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
169
+
170
+ if (!ev.recoverable) {
171
+ this.agentSession._onError(ev);
172
+ this.fallbackToVadInterruption();
173
+ return;
174
+ }
175
+
176
+ this.agentSession._onError(ev);
177
+ };
93
178
 
94
179
  /** @internal */
95
180
  _mainTask?: Task<void>;
96
- _userTurnCompletedTask?: Promise<void>;
181
+ _onEnterTask?: Task<void>;
182
+ _onExitTask?: Task<void>;
183
+ _userTurnCompletedTask?: Task<void>;
97
184
 
98
185
  constructor(agent: Agent, agentSession: AgentSession) {
99
186
  this.agent = agent;
@@ -114,7 +201,7 @@ export class AgentActivity implements RecognitionHooks {
114
201
 
115
202
  if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
116
203
  this.logger.warn(
117
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
204
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
118
205
  );
119
206
  this.turnDetectionMode = undefined;
120
207
  }
@@ -177,104 +264,172 @@ export class AgentActivity implements RecognitionHooks {
177
264
  if (
178
265
  !this.vad &&
179
266
  this.stt &&
267
+ !this.stt.capabilities.streaming &&
180
268
  this.llm instanceof LLM &&
181
269
  this.allowInterruptions &&
182
270
  this.turnDetectionMode === undefined
183
271
  ) {
184
272
  this.logger.warn(
185
- 'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
273
+ 'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
186
274
  'for more responsive interruption handling.',
187
275
  );
188
276
  }
277
+
278
+ this.interruptionDetector = this.resolveInterruptionDetector();
279
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
280
+
281
+ // this allows taking over audio interruption temporarily until interruption is detected
282
+ // by default is is ture unless turnDetection is manual or realtime_llm
283
+ this.isInterruptionByAudioActivityEnabled =
284
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
285
+
286
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
189
287
  }
190
288
 
191
289
  async start(): Promise<void> {
192
290
  const unlock = await this.lock.lock();
193
291
  try {
194
- this.agent._agentActivity = this;
195
-
196
- if (this.llm instanceof RealtimeModel) {
197
- this.realtimeSession = this.llm.session();
198
- this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
199
- this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
200
- this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
201
- this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
202
- this.onInputAudioTranscriptionCompleted(ev),
203
- );
204
- this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
205
- this.realtimeSession.on('error', (ev) => this.onError(ev));
206
-
207
- removeInstructions(this.agent._chatCtx);
208
- try {
209
- await this.realtimeSession.updateInstructions(this.agent.instructions);
210
- } catch (error) {
211
- this.logger.error(error, 'failed to update the instructions');
212
- }
292
+ await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
293
+ } finally {
294
+ unlock();
295
+ }
296
+ }
213
297
 
214
- try {
215
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
216
- } catch (error) {
217
- this.logger.error(error, 'failed to update the chat context');
218
- }
298
+ async resume(): Promise<void> {
299
+ const unlock = await this.lock.lock();
300
+ try {
301
+ await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
302
+ } finally {
303
+ unlock();
304
+ }
305
+ }
219
306
 
220
- try {
221
- await this.realtimeSession.updateTools(this.tools);
222
- } catch (error) {
223
- this.logger.error(error, 'failed to update the tools');
224
- }
225
- } else if (this.llm instanceof LLM) {
226
- try {
227
- updateInstructions({
228
- chatCtx: this.agent._chatCtx,
229
- instructions: this.agent.instructions,
230
- addIfMissing: true,
231
- });
232
- } catch (error) {
233
- this.logger.error('failed to update the instructions', error);
234
- }
235
- }
307
+ private async _startSession(options: {
308
+ spanName: 'start_agent_activity' | 'resume_agent_activity';
309
+ runOnEnter: boolean;
310
+ }): Promise<void> {
311
+ const { spanName, runOnEnter } = options;
312
+ const startSpan = tracer.startSpan({
313
+ name: spanName,
314
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
315
+ context: ROOT_CONTEXT,
316
+ });
317
+
318
+ this.agent._agentActivity = this;
319
+
320
+ if (this.llm instanceof RealtimeModel) {
321
+ this.realtimeSession = this.llm.session();
322
+ this.realtimeSpans = new Map<string, Span>();
323
+ this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
324
+ this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
325
+ this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
326
+ this.realtimeSession.on(
327
+ 'input_audio_transcription_completed',
328
+ this.onRealtimeInputAudioTranscriptionCompleted,
329
+ );
330
+ this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
331
+ this.realtimeSession.on('error', this.onModelError);
236
332
 
237
- // metrics and error handling
238
- if (this.llm instanceof LLM) {
239
- this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
240
- this.llm.on('error', (ev) => this.onError(ev));
333
+ removeInstructions(this.agent._chatCtx);
334
+ try {
335
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
336
+ } catch (error) {
337
+ this.logger.error(error, 'failed to update the instructions');
241
338
  }
242
339
 
243
- if (this.stt instanceof STT) {
244
- this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
245
- this.stt.on('error', (ev) => this.onError(ev));
340
+ try {
341
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
342
+ } catch (error) {
343
+ this.logger.error(error, 'failed to update the chat context');
246
344
  }
247
345
 
248
- if (this.tts instanceof TTS) {
249
- this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
250
- this.tts.on('error', (ev) => this.onError(ev));
346
+ try {
347
+ await this.realtimeSession.updateTools(this.tools);
348
+ } catch (error) {
349
+ this.logger.error(error, 'failed to update the tools');
251
350
  }
252
351
 
253
- if (this.vad instanceof VAD) {
254
- this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
352
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
353
+ this.logger.error(
354
+ 'audio output is enabled but RealtimeModel has no audio modality ' +
355
+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
356
+ 'or set a TTS model.',
357
+ );
358
+ }
359
+ } else if (this.llm instanceof LLM) {
360
+ try {
361
+ updateInstructions({
362
+ chatCtx: this.agent._chatCtx,
363
+ instructions: this.agent.instructions,
364
+ addIfMissing: true,
365
+ });
366
+ } catch (error) {
367
+ this.logger.error('failed to update the instructions', error);
255
368
  }
369
+ }
256
370
 
257
- this.audioRecognition = new AudioRecognition({
258
- recognitionHooks: this,
259
- // Disable stt node if stt is not provided
260
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
261
- vad: this.vad,
262
- turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
263
- turnDetectionMode: this.turnDetectionMode,
264
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
265
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
266
- });
267
- this.audioRecognition.start();
268
- this.started = true;
371
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
269
372
 
270
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
- this.createSpeechTask({
272
- task: Task.from(() => this.agent.onEnter()),
373
+ // metrics and error handling
374
+ if (this.llm instanceof LLM) {
375
+ this.llm.on('metrics_collected', this.onMetricsCollected);
376
+ this.llm.on('error', this.onModelError);
377
+ }
378
+
379
+ if (this.stt instanceof STT) {
380
+ this.stt.on('metrics_collected', this.onMetricsCollected);
381
+ this.stt.on('error', this.onModelError);
382
+ }
383
+
384
+ if (this.tts instanceof TTS) {
385
+ this.tts.on('metrics_collected', this.onMetricsCollected);
386
+ this.tts.on('error', this.onModelError);
387
+ }
388
+
389
+ if (this.vad instanceof VAD) {
390
+ this.vad.on('metrics_collected', this.onMetricsCollected);
391
+ }
392
+
393
+ this.audioRecognition = new AudioRecognition({
394
+ recognitionHooks: this,
395
+ // Disable stt node if stt is not provided
396
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
397
+ vad: this.vad,
398
+ turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
399
+ turnDetectionMode: this.turnDetectionMode,
400
+ interruptionDetection: this.interruptionDetector,
401
+ minEndpointingDelay:
402
+ this.agent.turnHandling?.endpointing?.minDelay ??
403
+ this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
404
+ maxEndpointingDelay:
405
+ this.agent.turnHandling?.endpointing?.maxDelay ??
406
+ this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
407
+ rootSpanContext: this.agentSession.rootSpanContext,
408
+ sttModel: this.stt?.label,
409
+ sttProvider: this.getSttProvider(),
410
+ getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
411
+ });
412
+ this.audioRecognition.start();
413
+ this.started = true;
414
+
415
+ this._resumeSchedulingTask();
416
+
417
+ if (runOnEnter) {
418
+ this._onEnterTask = this.createSpeechTask({
419
+ taskFn: () =>
420
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
421
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
422
+ name: 'on_enter',
423
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
424
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
425
+ }),
426
+ ),
427
+ inlineTask: true,
273
428
  name: 'AgentActivity_onEnter',
274
429
  });
275
- } finally {
276
- unlock();
277
430
  }
431
+
432
+ startSpan.end();
278
433
  }
279
434
 
280
435
  get currentSpeech(): SpeechHandle | undefined {
@@ -289,6 +444,17 @@ export class AgentActivity implements RecognitionHooks {
289
444
  return this.agent.stt || this.agentSession.stt;
290
445
  }
291
446
 
447
+ private getSttProvider(): string | undefined {
448
+ const label = this.stt?.label;
449
+ if (!label) {
450
+ return undefined;
451
+ }
452
+
453
+ // Heuristic: most labels look like "<provider>-<model>"
454
+ const [provider] = label.split('-', 1);
455
+ return provider || label;
456
+ }
457
+
292
458
  get llm(): LLM | RealtimeModel | undefined {
293
459
  return this.agent.llm || this.agentSession.llm;
294
460
  }
@@ -301,8 +467,8 @@ export class AgentActivity implements RecognitionHooks {
301
467
  return this.agent.toolCtx;
302
468
  }
303
469
 
304
- get draining(): boolean {
305
- return this._draining;
470
+ get schedulingPaused(): boolean {
471
+ return this._schedulingPaused;
306
472
  }
307
473
 
308
474
  get realtimeLLMSession(): RealtimeSession | undefined {
@@ -310,19 +476,48 @@ export class AgentActivity implements RecognitionHooks {
310
476
  }
311
477
 
312
478
  get allowInterruptions(): boolean {
313
- // TODO(AJS-51): Allow options to be defined in Agent class
314
- return this.agentSession.options.allowInterruptions;
479
+ return (
480
+ this.agent.turnHandling?.interruption?.enabled ??
481
+ this.agentSession.sessionOptions.turnHandling.interruption.enabled
482
+ );
483
+ }
484
+
485
+ get useTtsAlignedTranscript(): boolean {
486
+ // Agent setting takes precedence over session setting
487
+ return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
315
488
  }
316
489
 
317
490
  get turnDetection(): TurnDetectionMode | undefined {
318
- // TODO(brian): prioritize using agent.turn_detection
319
- return this.agentSession.turnDetection;
491
+ return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection;
492
+ }
493
+
494
+ get turnHandling() {
495
+ return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
320
496
  }
321
497
 
498
+ // get minEndpointingDelay(): number {
499
+ // return (
500
+ // this.agent.turnHandling?.endpointing?.minDelay ??
501
+ // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
502
+ // );
503
+ // }
504
+
505
+ // get maxEndpointingDelay(): number {
506
+ // return (
507
+ // this.agent.turnHandling?.endpointing?.maxDelay ??
508
+ // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
509
+ // );
510
+ // }
511
+
322
512
  get toolCtx(): ToolContext {
323
513
  return this.agent.toolCtx;
324
514
  }
325
515
 
516
+ /** @internal */
517
+ get inputStartedAt() {
518
+ return this.audioRecognition?.inputStartedAt;
519
+ }
520
+
326
521
  async updateChatCtx(chatCtx: ChatContext): Promise<void> {
327
522
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
328
523
 
@@ -340,7 +535,27 @@ export class AgentActivity implements RecognitionHooks {
340
535
  }
341
536
  }
342
537
 
343
- updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
538
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
539
+ async updateTools(tools: ToolContext): Promise<void> {
540
+ this.agent._tools = { ...tools };
541
+
542
+ if (this.realtimeSession) {
543
+ await this.realtimeSession.updateTools(tools);
544
+ }
545
+
546
+ if (this.llm instanceof LLM) {
547
+ // for realtime LLM, we assume the server will remove unvalid tool messages
548
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
549
+ }
550
+ }
551
+
552
+ updateOptions({
553
+ toolChoice,
554
+ turnDetection,
555
+ }: {
556
+ toolChoice?: ToolChoice | null;
557
+ turnDetection?: TurnDetectionMode;
558
+ }): void {
344
559
  if (toolChoice !== undefined) {
345
560
  this.toolChoice = toolChoice;
346
561
  }
@@ -348,43 +563,85 @@ export class AgentActivity implements RecognitionHooks {
348
563
  if (this.realtimeSession) {
349
564
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
350
565
  }
566
+
567
+ if (turnDetection !== undefined) {
568
+ this.turnDetectionMode = turnDetection;
569
+ this.isDefaultInterruptionByAudioActivityEnabled =
570
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
571
+
572
+ // sync live flag immediately when not speaking so the change takes effect right away
573
+ if (this.agentSession.agentState !== 'speaking') {
574
+ this.isInterruptionByAudioActivityEnabled =
575
+ this.isDefaultInterruptionByAudioActivityEnabled;
576
+ }
577
+ }
578
+
579
+ if (this.audioRecognition) {
580
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
581
+ }
351
582
  }
352
583
 
353
584
  attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
354
- if (this.audioStream.isSourceSet) {
355
- this.logger.debug('detaching existing audio input in agent activity');
356
- this.audioStream.detachSource();
357
- }
585
+ void this.audioStream.close();
586
+ this.audioStream = new MultiInputStream<AudioFrame>();
587
+
588
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
589
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
590
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
591
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
592
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
593
+ transform: (frame, controller) => {
594
+ const shouldDiscardForAecWarmup =
595
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
596
+ if (!shouldDiscardForAecWarmup) {
597
+ controller.enqueue(frame);
598
+ }
599
+ },
600
+ });
358
601
 
359
- /**
360
- * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
361
- * The tee() operation should be applied to the deferred stream, not the original audioStream.
362
- * This is important because teeing the original stream directly makes it very difficult—if not
363
- * impossible—to implement stream unlock logic cleanly.
364
- */
365
- this.audioStream.setSource(audioStream);
366
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
602
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
367
603
 
368
- if (this.realtimeSession) {
604
+ if (this.realtimeSession && this.audioRecognition) {
605
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
606
+ .pipeThrough(aecWarmupAudioFilter)
607
+ .tee();
369
608
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
370
- }
371
-
372
- if (this.audioRecognition) {
373
609
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
610
+ } else if (this.realtimeSession) {
611
+ this.realtimeSession.setInputAudioStream(
612
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
613
+ );
614
+ } else if (this.audioRecognition) {
615
+ this.audioRecognition.setInputAudioStream(
616
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
617
+ );
374
618
  }
375
619
  }
376
620
 
377
621
  detachAudioInput(): void {
378
- this.audioStream.detachSource();
622
+ if (this.audioStreamId === undefined) {
623
+ return;
624
+ }
625
+
626
+ void this.audioStream.close();
627
+ this.audioStream = new MultiInputStream<AudioFrame>();
628
+ this.audioStreamId = undefined;
379
629
  }
380
630
 
381
- commitUserTurn() {
631
+ commitUserTurn(
632
+ options: {
633
+ audioDetached?: boolean;
634
+ throwIfNotReady?: boolean;
635
+ } = {},
636
+ ) {
637
+ const { audioDetached = false, throwIfNotReady = true } = options;
382
638
  if (!this.audioRecognition) {
383
- throw new Error('AudioRecognition is not initialized');
639
+ if (throwIfNotReady) {
640
+ throw new Error('AudioRecognition is not initialized');
641
+ }
642
+ return;
384
643
  }
385
644
 
386
- // TODO(brian): add audio_detached flag
387
- const audioDetached = false;
388
645
  this.audioRecognition.commitUserTurn(audioDetached);
389
646
  }
390
647
 
@@ -442,14 +699,13 @@ export class AgentActivity implements RecognitionHooks {
442
699
  }),
443
700
  );
444
701
  const task = this.createSpeechTask({
445
- task: Task.from((abortController: AbortController) =>
702
+ taskFn: (abortController: AbortController) =>
446
703
  this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
447
- ),
448
704
  ownedSpeechHandle: handle,
449
705
  name: 'AgentActivity.say_tts',
450
706
  });
451
707
 
452
- task.finally(() => this.onPipelineReplyDone());
708
+ task.result.finally(() => this.onPipelineReplyDone());
453
709
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
454
710
  return handle;
455
711
  }
@@ -463,6 +719,18 @@ export class AgentActivity implements RecognitionHooks {
463
719
  if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
464
720
  ev.speechId = speechHandle.id;
465
721
  }
722
+
723
+ // Record realtime metrics on the associated span (if available)
724
+ if (ev.type === 'realtime_model_metrics' && this.realtimeSpans) {
725
+ const span = this.realtimeSpans.get(ev.requestId);
726
+ if (span) {
727
+ recordRealtimeMetrics(span, ev);
728
+ this.realtimeSpans.delete(ev.requestId);
729
+ }
730
+ }
731
+
732
+ this.agentSession._usageCollector.collect(ev);
733
+
466
734
  this.agentSession.emit(
467
735
  AgentSessionEventTypes.MetricsCollected,
468
736
  createMetricsCollectedEvent({ metrics: ev }),
@@ -494,6 +762,13 @@ export class AgentActivity implements RecognitionHooks {
494
762
 
495
763
  if (!this.vad) {
496
764
  this.agentSession._updateUserState('speaking');
765
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
766
+ this.audioRecognition.onStartOfOverlapSpeech(
767
+ 0,
768
+ Date.now(),
769
+ this.agentSession._userSpeakingSpan,
770
+ );
771
+ }
497
772
  }
498
773
 
499
774
  // this.interrupt() is going to raise when allow_interruptions is False,
@@ -512,6 +787,9 @@ export class AgentActivity implements RecognitionHooks {
512
787
  this.logger.info(ev, 'onInputSpeechStopped');
513
788
 
514
789
  if (!this.vad) {
790
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
791
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
792
+ }
515
793
  this.agentSession._updateUserState('listening');
516
794
  }
517
795
 
@@ -552,10 +830,9 @@ export class AgentActivity implements RecognitionHooks {
552
830
  return;
553
831
  }
554
832
 
555
- if (this.draining) {
556
- // copied from python:
833
+ if (this.schedulingPaused) {
557
834
  // TODO(shubhra): should we "forward" this new turn to the next agent?
558
- this.logger.warn('skipping new realtime generation, the agent is draining');
835
+ this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
559
836
  return;
560
837
  }
561
838
 
@@ -573,9 +850,8 @@ export class AgentActivity implements RecognitionHooks {
573
850
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
574
851
 
575
852
  this.createSpeechTask({
576
- task: Task.from((abortController: AbortController) =>
853
+ taskFn: (abortController: AbortController) =>
577
854
  this.realtimeGenerationTask(handle, ev, {}, abortController),
578
- ),
579
855
  ownedSpeechHandle: handle,
580
856
  name: 'AgentActivity.realtimeGeneration',
581
857
  });
@@ -584,13 +860,43 @@ export class AgentActivity implements RecognitionHooks {
584
860
  }
585
861
 
586
862
  // recognition hooks
587
-
588
- onStartOfSpeech(_ev: VADEvent): void {
589
- this.agentSession._updateUserState('speaking');
863
+ onStartOfSpeech(ev: VADEvent): void {
864
+ let speechStartTime = Date.now();
865
+ if (ev) {
866
+ // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
867
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
868
+ }
869
+ this.agentSession._updateUserState('speaking', {
870
+ lastSpeakingTime: speechStartTime,
871
+ otelContext: otelContext.active(),
872
+ });
873
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
874
+ // Pass speechStartTime as the absolute startedAt timestamp.
875
+ this.audioRecognition.onStartOfOverlapSpeech(
876
+ ev.speechDuration,
877
+ speechStartTime,
878
+ this.agentSession._userSpeakingSpan,
879
+ );
880
+ }
590
881
  }
591
882
 
592
- onEndOfSpeech(_ev: VADEvent): void {
593
- this.agentSession._updateUserState('listening');
883
+ onEndOfSpeech(ev: VADEvent): void {
884
+ let speechEndTime = Date.now();
885
+ if (ev) {
886
+ // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
887
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
888
+ }
889
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
890
+ // Pass speechEndTime as the absolute endedAt timestamp.
891
+ this.audioRecognition.onEndOfOverlapSpeech(
892
+ speechEndTime,
893
+ this.agentSession._userSpeakingSpan,
894
+ );
895
+ }
896
+ this.agentSession._updateUserState('listening', {
897
+ lastSpeakingTime: speechEndTime,
898
+ otelContext: otelContext.active(),
899
+ });
594
900
  }
595
901
 
596
902
  onVADInferenceDone(ev: VADEvent): void {
@@ -599,20 +905,47 @@ export class AgentActivity implements RecognitionHooks {
599
905
  return;
600
906
  }
601
907
 
602
- if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
603
- // skip speech handle interruption if server side turn detection is enabled
908
+ if (
909
+ ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration
910
+ ) {
911
+ this.interruptByAudioActivity();
912
+ }
913
+ }
914
+
915
+ private interruptByAudioActivity(): void {
916
+ if (!this.isInterruptionByAudioActivityEnabled) {
604
917
  return;
605
918
  }
606
919
 
607
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
920
+ if (this.agentSession._aecWarmupRemaining > 0) {
921
+ // Disable interruption from audio activity while AEC warmup is active.
608
922
  return;
609
923
  }
610
924
 
611
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
612
- const text = this.audioRecognition.currentTranscript;
925
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
926
+ // skip speech handle interruption if server side turn detection is enabled
927
+ return;
928
+ }
613
929
 
930
+ // Refactored interruption word count check:
931
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
932
+ // - Apply check to all STT results: empty string, undefined, or any length
933
+ // - This ensures consistent behavior across all interruption scenarios
934
+ if (
935
+ this.stt &&
936
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 &&
937
+ this.audioRecognition
938
+ ) {
939
+ const text = this.audioRecognition.currentTranscript;
614
940
  // TODO(shubhra): better word splitting for multi-language
615
- if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
941
+
942
+ // Normalize text: convert undefined/null to empty string for consistent word counting
943
+ const normalizedText = text ?? '';
944
+ const wordCount = splitWords(normalizedText, true).length;
945
+
946
+ // Only allow interruption if word count meets or exceeds minInterruptionWords
947
+ // This applies to all cases: empty strings, partial speech, and full speech
948
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
616
949
  return;
617
950
  }
618
951
  }
@@ -624,12 +957,23 @@ export class AgentActivity implements RecognitionHooks {
624
957
  !this._currentSpeech.interrupted &&
625
958
  this._currentSpeech.allowInterruptions
626
959
  ) {
627
- this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
960
+ this.logger.info(
961
+ { 'speech id': this._currentSpeech.id },
962
+ 'speech interrupted by audio activity',
963
+ );
628
964
  this.realtimeSession?.interrupt();
629
965
  this._currentSpeech.interrupt();
630
966
  }
631
967
  }
632
968
 
969
+ onInterruption(ev: OverlappingSpeechEvent) {
970
+ this.restoreInterruptionByAudioActivity();
971
+ this.interruptByAudioActivity();
972
+ if (this.audioRecognition) {
973
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
974
+ }
975
+ }
976
+
633
977
  onInterimTranscript(ev: SpeechEvent): void {
634
978
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
635
979
  // skip stt transcription if userTranscription is enabled on the realtime model
@@ -641,9 +985,14 @@ export class AgentActivity implements RecognitionHooks {
641
985
  createUserInputTranscribedEvent({
642
986
  transcript: ev.alternatives![0].text,
643
987
  isFinal: false,
988
+ language: ev.alternatives![0].language,
644
989
  // TODO(AJS-106): add multi participant support
645
990
  }),
646
991
  );
992
+
993
+ if (ev.alternatives![0].text) {
994
+ this.interruptByAudioActivity();
995
+ }
647
996
  }
648
997
 
649
998
  onFinalTranscript(ev: SpeechEvent): void {
@@ -657,17 +1006,103 @@ export class AgentActivity implements RecognitionHooks {
657
1006
  createUserInputTranscribedEvent({
658
1007
  transcript: ev.alternatives![0].text,
659
1008
  isFinal: true,
1009
+ language: ev.alternatives![0].language,
660
1010
  // TODO(AJS-106): add multi participant support
661
1011
  }),
662
1012
  );
1013
+
1014
+ // agent speech might not be interrupted if VAD failed and a final transcript is received
1015
+ // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
1016
+ if (
1017
+ this.audioRecognition &&
1018
+ this.turnDetection !== 'manual' &&
1019
+ this.turnDetection !== 'realtime_llm'
1020
+ ) {
1021
+ this.interruptByAudioActivity();
1022
+
1023
+ // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
1024
+ }
1025
+
1026
+ // TODO: resume false interruption - start interrupt paused speech task
1027
+ }
1028
+
1029
+ onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
1030
+ if (
1031
+ !this.agentSession.sessionOptions.preemptiveGeneration ||
1032
+ this.schedulingPaused ||
1033
+ (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
1034
+ !(this.llm instanceof LLM)
1035
+ ) {
1036
+ return;
1037
+ }
1038
+
1039
+ this.cancelPreemptiveGeneration();
1040
+
1041
+ this.logger.info(
1042
+ {
1043
+ newTranscript: info.newTranscript,
1044
+ transcriptConfidence: info.transcriptConfidence,
1045
+ },
1046
+ 'starting preemptive generation',
1047
+ );
1048
+
1049
+ const userMessage = ChatMessage.create({
1050
+ role: 'user',
1051
+ content: info.newTranscript,
1052
+ transcriptConfidence: info.transcriptConfidence,
1053
+ });
1054
+ const chatCtx = this.agent.chatCtx.copy();
1055
+ const speechHandle = this.generateReply({
1056
+ userMessage,
1057
+ chatCtx,
1058
+ scheduleSpeech: false,
1059
+ });
1060
+
1061
+ this._preemptiveGeneration = {
1062
+ speechHandle,
1063
+ userMessage,
1064
+ info,
1065
+ chatCtx: chatCtx.copy(),
1066
+ tools: { ...this.tools },
1067
+ toolChoice: this.toolChoice,
1068
+ createdAt: Date.now(),
1069
+ };
1070
+ }
1071
+
1072
+ private cancelPreemptiveGeneration(): void {
1073
+ if (this._preemptiveGeneration !== undefined) {
1074
+ this._preemptiveGeneration.speechHandle._cancel();
1075
+ this._preemptiveGeneration = undefined;
1076
+ }
663
1077
  }
664
1078
 
665
1079
  private createSpeechTask(options: {
666
- task: Task<void>;
1080
+ taskFn: (controller: AbortController) => Promise<void>;
1081
+ controller?: AbortController;
667
1082
  ownedSpeechHandle?: SpeechHandle;
1083
+ inlineTask?: boolean;
668
1084
  name?: string;
669
- }): Promise<void> {
670
- const { task, ownedSpeechHandle } = options;
1085
+ }): Task<void> {
1086
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
1087
+
1088
+ const wrappedFn = (ctrl: AbortController) => {
1089
+ return agentActivityStorage.run(this, () => {
1090
+ // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
1091
+ // before post-construction metadata is attached to the Task instance.
1092
+ const currentTask = Task.current();
1093
+ if (currentTask) {
1094
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
1095
+ }
1096
+
1097
+ if (ownedSpeechHandle) {
1098
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
1099
+ }
1100
+ return taskFn(ctrl);
1101
+ });
1102
+ };
1103
+
1104
+ const task = Task.from(wrappedFn, controller, name);
1105
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
671
1106
 
672
1107
  this.speechTasks.add(task);
673
1108
  task.addDoneCallback(() => {
@@ -687,34 +1122,50 @@ export class AgentActivity implements RecognitionHooks {
687
1122
  this.wakeupMainTask();
688
1123
  });
689
1124
 
690
- return task.result;
1125
+ return task;
691
1126
  }
692
1127
 
693
1128
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
694
- if (this.draining) {
695
- this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
696
- // copied from python:
1129
+ if (this.schedulingPaused) {
1130
+ this.cancelPreemptiveGeneration();
1131
+ this.logger.warn(
1132
+ { user_input: info.newTranscript },
1133
+ 'skipping user input, speech scheduling is paused',
1134
+ );
697
1135
  // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
698
1136
  return true;
699
1137
  }
700
1138
 
1139
+ // Refactored interruption word count check for consistency with onVADInferenceDone:
1140
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
1141
+ // - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
701
1142
  if (
702
1143
  this.stt &&
703
1144
  this.turnDetection !== 'manual' &&
704
1145
  this._currentSpeech &&
705
1146
  this._currentSpeech.allowInterruptions &&
706
1147
  !this._currentSpeech.interrupted &&
707
- this.agentSession.options.minInterruptionWords > 0 &&
708
- info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
1148
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0
709
1149
  ) {
710
- // avoid interruption if the new_transcript is too short
711
- this.logger.info('skipping user input, new_transcript is too short');
712
- return false;
1150
+ const wordCount = splitWords(info.newTranscript, true).length;
1151
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
1152
+ // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
1153
+ this.cancelPreemptiveGeneration();
1154
+ this.logger.info(
1155
+ {
1156
+ wordCount,
1157
+ minInterruptionWords:
1158
+ this.agentSession.sessionOptions.turnHandling.interruption.minWords,
1159
+ },
1160
+ 'skipping user input, word count below minimum interruption threshold',
1161
+ );
1162
+ return false;
1163
+ }
713
1164
  }
714
1165
 
715
1166
  const oldTask = this._userTurnCompletedTask;
716
1167
  this._userTurnCompletedTask = this.createSpeechTask({
717
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
1168
+ taskFn: () => this.userTurnCompleted(info, oldTask),
718
1169
  name: 'AgentActivity.userTurnCompleted',
719
1170
  });
720
1171
  return true;
@@ -744,16 +1195,28 @@ export class AgentActivity implements RecognitionHooks {
744
1195
  throw new Error('Speech queue is empty');
745
1196
  }
746
1197
  const speechHandle = heapItem[2];
1198
+
1199
+ // Skip speech handles that were already interrupted/done before being
1200
+ // picked up from the queue (e.g. interrupted during shutdown before the
1201
+ // main loop had a chance to process them). Calling _authorizeGeneration
1202
+ // on a done handle would create a generation Future that nobody resolves,
1203
+ // causing the main loop to hang forever.
1204
+ if (speechHandle.interrupted || speechHandle.done()) {
1205
+ continue;
1206
+ }
1207
+
747
1208
  this._currentSpeech = speechHandle;
748
1209
  speechHandle._authorizeGeneration();
749
- await speechHandle._waitForGeneration();
1210
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
750
1211
  this._currentSpeech = undefined;
751
1212
  }
752
1213
 
753
- // If we're draining and there are no more speech tasks, we can exit.
754
- // Only speech tasks can bypass draining to create a tool response
755
- if (this.draining && this.speechTasks.size === 0) {
756
- this.logger.info('mainTask: draining and no more speech tasks');
1214
+ // if we're draining/pausing and there are no more speech tasks, we can exit.
1215
+ // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
1216
+ const toWait = this.getDrainPendingSpeechTasks();
1217
+
1218
+ if (this._schedulingPaused && toWait.length === 0) {
1219
+ this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
757
1220
  break;
758
1221
  }
759
1222
 
@@ -763,6 +1226,39 @@ export class AgentActivity implements RecognitionHooks {
763
1226
  this.logger.info('AgentActivity mainTask: exiting');
764
1227
  }
765
1228
 
1229
+ private getDrainPendingSpeechTasks(): Task<void>[] {
1230
+ const blockedHandles: SpeechHandle[] = [];
1231
+
1232
+ for (const task of this._drainBlockedTasks) {
1233
+ const info = _getActivityTaskInfo(task);
1234
+ if (!info) {
1235
+ this.logger.error('blocked task without activity info; skipping.');
1236
+ continue;
1237
+ }
1238
+
1239
+ if (!info.speechHandle) {
1240
+ continue; // onEnter/onExit
1241
+ }
1242
+
1243
+ blockedHandles.push(info.speechHandle);
1244
+ }
1245
+
1246
+ const toWait: Task<void>[] = [];
1247
+ for (const task of this.speechTasks) {
1248
+ if (this._drainBlockedTasks.includes(task)) {
1249
+ continue;
1250
+ }
1251
+
1252
+ const info = _getActivityTaskInfo(task);
1253
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
1254
+ continue;
1255
+ }
1256
+
1257
+ toWait.push(task);
1258
+ }
1259
+ return toWait;
1260
+ }
1261
+
766
1262
  private wakeupMainTask(): void {
767
1263
  this.q_updated.resolve();
768
1264
  }
@@ -773,6 +1269,7 @@ export class AgentActivity implements RecognitionHooks {
773
1269
  instructions?: string;
774
1270
  toolChoice?: ToolChoice | null;
775
1271
  allowInterruptions?: boolean;
1272
+ scheduleSpeech?: boolean;
776
1273
  }): SpeechHandle {
777
1274
  const {
778
1275
  userMessage,
@@ -780,6 +1277,7 @@ export class AgentActivity implements RecognitionHooks {
780
1277
  instructions: defaultInstructions,
781
1278
  toolChoice: defaultToolChoice,
782
1279
  allowInterruptions: defaultAllowInterruptions,
1280
+ scheduleSpeech = true,
783
1281
  } = options;
784
1282
 
785
1283
  let instructions = defaultInstructions;
@@ -802,7 +1300,7 @@ export class AgentActivity implements RecognitionHooks {
802
1300
  throw new Error('trying to generate reply without an LLM model');
803
1301
  }
804
1302
 
805
- const functionCall = asyncLocalStorage.getStore()?.functionCall;
1303
+ const functionCall = functionCallStorage.getStore()?.functionCall;
806
1304
  if (toolChoice === undefined && functionCall !== undefined) {
807
1305
  // when generateReply is called inside a tool, set toolChoice to 'none' by default
808
1306
  toolChoice = 'none';
@@ -824,7 +1322,7 @@ export class AgentActivity implements RecognitionHooks {
824
1322
 
825
1323
  if (this.llm instanceof RealtimeModel) {
826
1324
  this.createSpeechTask({
827
- task: Task.from((abortController: AbortController) =>
1325
+ taskFn: (abortController: AbortController) =>
828
1326
  this.realtimeReplyTask({
829
1327
  speechHandle: handle,
830
1328
  // TODO(brian): support llm.ChatMessage for the realtime model
@@ -836,7 +1334,6 @@ export class AgentActivity implements RecognitionHooks {
836
1334
  },
837
1335
  abortController,
838
1336
  }),
839
- ),
840
1337
  ownedSpeechHandle: handle,
841
1338
  name: 'AgentActivity.realtimeReply',
842
1339
  });
@@ -848,46 +1345,80 @@ export class AgentActivity implements RecognitionHooks {
848
1345
  instructions = `${this.agent.instructions}\n${instructions}`;
849
1346
  }
850
1347
 
1348
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1349
+ const onEnterData = onEnterStorage.getStore();
1350
+ const shouldFilterTools =
1351
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1352
+
1353
+ const tools = shouldFilterTools
1354
+ ? Object.fromEntries(
1355
+ Object.entries(this.agent.toolCtx).filter(
1356
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1357
+ ),
1358
+ )
1359
+ : this.agent.toolCtx;
1360
+
851
1361
  const task = this.createSpeechTask({
852
- task: Task.from((abortController: AbortController) =>
1362
+ taskFn: (abortController: AbortController) =>
853
1363
  this.pipelineReplyTask(
854
1364
  handle,
855
1365
  chatCtx ?? this.agent.chatCtx,
856
- this.agent.toolCtx,
1366
+ tools,
857
1367
  {
858
1368
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
859
1369
  },
860
1370
  abortController,
861
- instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
1371
+ instructions,
862
1372
  userMessage,
863
1373
  ),
864
- ),
865
1374
  ownedSpeechHandle: handle,
866
1375
  name: 'AgentActivity.pipelineReply',
867
1376
  });
868
1377
 
869
- task.finally(() => this.onPipelineReplyDone());
1378
+ task.result.finally(() => this.onPipelineReplyDone());
870
1379
  }
871
1380
 
872
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1381
+ if (scheduleSpeech) {
1382
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1383
+ }
873
1384
  return handle;
874
1385
  }
875
1386
 
876
- interrupt(): Future<void> {
1387
+ interrupt(options: { force?: boolean } = {}): Future<void> {
1388
+ const { force = false } = options;
1389
+ this.cancelPreemptiveGeneration();
1390
+
877
1391
  const future = new Future<void>();
878
1392
  const currentSpeech = this._currentSpeech;
879
1393
 
880
1394
  //TODO(AJS-273): add interrupt for background speeches
881
1395
 
882
- currentSpeech?.interrupt();
1396
+ currentSpeech?.interrupt(force);
883
1397
 
884
1398
  for (const [_, __, speech] of this.speechQueue) {
885
- speech.interrupt();
1399
+ speech.interrupt(force);
886
1400
  }
887
1401
 
888
1402
  this.realtimeSession?.interrupt();
889
1403
 
890
- if (currentSpeech === undefined) {
1404
+ if (force) {
1405
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1406
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1407
+ // when the room is disconnected). Mark the current speech as done immediately
1408
+ // so the interrupt future resolves without waiting for tasks to finish.
1409
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1410
+ // and hang on _waitForGeneration() (the generation future created by
1411
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1412
+ // once doneFut is already settled).
1413
+ for (const task of this.speechTasks) {
1414
+ task.cancel();
1415
+ }
1416
+ if (currentSpeech && !currentSpeech.done()) {
1417
+ currentSpeech._markDone();
1418
+ }
1419
+ this.speechQueue.clear();
1420
+ future.resolve();
1421
+ } else if (currentSpeech === undefined) {
891
1422
  future.resolve();
892
1423
  } else {
893
1424
  currentSpeech.addDoneCallback(() => {
@@ -905,13 +1436,13 @@ export class AgentActivity implements RecognitionHooks {
905
1436
  }
906
1437
  }
907
1438
 
908
- private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
1439
+ private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
909
1440
  if (oldTask) {
910
1441
  // We never cancel user code as this is very confusing.
911
1442
  // So we wait for the old execution of onUserTurnCompleted to finish.
912
1443
  // In practice this is OK because most speeches will be interrupted if a new turn
913
1444
  // is detected. So the previous execution should complete quickly.
914
- await oldTask;
1445
+ await oldTask.result;
915
1446
  }
916
1447
 
917
1448
  // When the audio recognition detects the end of a user turn:
@@ -949,6 +1480,7 @@ export class AgentActivity implements RecognitionHooks {
949
1480
  let userMessage: ChatMessage | undefined = ChatMessage.create({
950
1481
  role: 'user',
951
1482
  content: info.newTranscript,
1483
+ transcriptConfidence: info.transcriptConfidence,
952
1484
  });
953
1485
 
954
1486
  // create a temporary mutable chat context to pass to onUserTurnCompleted
@@ -975,16 +1507,74 @@ export class AgentActivity implements RecognitionHooks {
975
1507
  return;
976
1508
  }
977
1509
 
978
- // Ensure the new message is passed to generateReply
979
- // This preserves the original message id, making it easier for users to track responses
980
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1510
+ const userMetricsReport: MetricsReport = {};
1511
+ if (info.startedSpeakingAt !== undefined) {
1512
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
1513
+ }
1514
+ if (info.stoppedSpeakingAt !== undefined) {
1515
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
1516
+ }
1517
+ if (info.transcriptionDelay !== undefined) {
1518
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
1519
+ }
1520
+ if (info.endOfUtteranceDelay !== undefined) {
1521
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
1522
+ }
1523
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
1524
+ if (userMessage) {
1525
+ userMessage.metrics = userMetricsReport;
1526
+ }
981
1527
 
982
- const eouMetrics: EOUMetrics = {
983
- type: 'eou_metrics',
984
- timestamp: Date.now(),
985
- endOfUtteranceDelay: info.endOfUtteranceDelay,
986
- transcriptionDelay: info.transcriptionDelay,
987
- onUserTurnCompletedDelay: callbackDuration,
1528
+ let speechHandle: SpeechHandle | undefined;
1529
+ if (this._preemptiveGeneration !== undefined) {
1530
+ const preemptive = this._preemptiveGeneration;
1531
+ // make sure the onUserTurnCompleted didn't change some request parameters
1532
+ // otherwise invalidate the preemptive generation
1533
+ if (
1534
+ preemptive.info.newTranscript === userMessage?.textContent &&
1535
+ preemptive.chatCtx.isEquivalent(chatCtx) &&
1536
+ isSameToolContext(preemptive.tools, this.tools) &&
1537
+ isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1538
+ ) {
1539
+ speechHandle = preemptive.speechHandle;
1540
+ // The preemptive userMessage was created without metrics.
1541
+ // Copy the metrics and transcriptConfidence from the new userMessage
1542
+ // to the preemptive message BEFORE scheduling (so the pipeline inserts
1543
+ // the message with metrics already set).
1544
+ if (preemptive.userMessage && userMessage) {
1545
+ preemptive.userMessage.metrics = userMetricsReport;
1546
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1547
+ }
1548
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1549
+ this.logger.debug(
1550
+ {
1551
+ preemptiveLeadTime: Date.now() - preemptive.createdAt,
1552
+ },
1553
+ 'using preemptive generation',
1554
+ );
1555
+ } else {
1556
+ this.logger.warn(
1557
+ 'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
1558
+ );
1559
+ preemptive.speechHandle._cancel();
1560
+ }
1561
+
1562
+ this._preemptiveGeneration = undefined;
1563
+ }
1564
+
1565
+ if (speechHandle === undefined) {
1566
+ // Ensure the new message is passed to generateReply
1567
+ // This preserves the original message id, making it easier for users to track responses
1568
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1569
+ }
1570
+
1571
+ const eouMetrics: EOUMetrics = {
1572
+ type: 'eou_metrics',
1573
+ timestamp: Date.now(),
1574
+ endOfUtteranceDelayMs: info.endOfUtteranceDelay,
1575
+ transcriptionDelayMs: info.transcriptionDelay,
1576
+ onUserTurnCompletedDelayMs: callbackDuration,
1577
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
988
1578
  speechId: speechHandle.id,
989
1579
  };
990
1580
 
@@ -1002,6 +1592,8 @@ export class AgentActivity implements RecognitionHooks {
1002
1592
  replyAbortController: AbortController,
1003
1593
  audio?: ReadableStream<AudioFrame> | null,
1004
1594
  ): Promise<void> {
1595
+ speechHandle._agentTurnContext = otelContext.active();
1596
+
1005
1597
  speechHandleStorage.enterWith(speechHandle);
1006
1598
 
1007
1599
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1046,28 +1638,44 @@ export class AgentActivity implements RecognitionHooks {
1046
1638
  tasks.push(textForwardTask);
1047
1639
  }
1048
1640
 
1049
- const onFirstFrame = () => {
1050
- this.agentSession._updateAgentState('speaking');
1641
+ let replyStartedSpeakingAt: number | undefined;
1642
+ let replyTtsGenData: _TTSGenerationData | null = null;
1643
+
1644
+ const onFirstFrame = (startedSpeakingAt?: number) => {
1645
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1646
+ this.agentSession._updateAgentState('speaking', {
1647
+ startTime: startedSpeakingAt,
1648
+ otelContext: speechHandle._agentTurnContext,
1649
+ });
1650
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1651
+ this.audioRecognition.onStartOfAgentSpeech();
1652
+ this.isInterruptionByAudioActivityEnabled = false;
1653
+ }
1051
1654
  };
1052
1655
 
1053
1656
  if (!audioOutput) {
1054
1657
  if (textOut) {
1055
- textOut.firstTextFut.await.finally(onFirstFrame);
1658
+ textOut.firstTextFut.await
1659
+ .then(() => onFirstFrame())
1660
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1056
1661
  }
1057
1662
  } else {
1058
1663
  let audioOut: _AudioOut | null = null;
1059
1664
  if (!audio) {
1060
1665
  // generate audio using TTS
1061
- const [ttsTask, ttsStream] = performTTSInference(
1666
+ const [ttsTask, ttsGenData] = performTTSInference(
1062
1667
  (...args) => this.agent.ttsNode(...args),
1063
1668
  audioSource,
1064
1669
  modelSettings,
1065
1670
  replyAbortController,
1671
+ this.tts?.model,
1672
+ this.tts?.provider,
1066
1673
  );
1067
1674
  tasks.push(ttsTask);
1675
+ replyTtsGenData = ttsGenData;
1068
1676
 
1069
1677
  const [forwardTask, _audioOut] = performAudioForwarding(
1070
- ttsStream,
1678
+ ttsGenData.audioStream,
1071
1679
  audioOutput,
1072
1680
  replyAbortController,
1073
1681
  );
@@ -1083,7 +1691,9 @@ export class AgentActivity implements RecognitionHooks {
1083
1691
  tasks.push(forwardTask);
1084
1692
  audioOut = _audioOut;
1085
1693
  }
1086
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1694
+ audioOut.firstFrameFut.await
1695
+ .then((ts) => onFirstFrame(ts))
1696
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
1087
1697
  }
1088
1698
 
1089
1699
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1102,10 +1712,21 @@ export class AgentActivity implements RecognitionHooks {
1102
1712
  }
1103
1713
 
1104
1714
  if (addToChatCtx) {
1715
+ const replyStoppedSpeakingAt = Date.now();
1716
+ const replyAssistantMetrics: MetricsReport = {};
1717
+ if (replyTtsGenData?.ttfb !== undefined) {
1718
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1719
+ }
1720
+ if (replyStartedSpeakingAt !== undefined) {
1721
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
1722
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
1723
+ }
1724
+
1105
1725
  const message = ChatMessage.create({
1106
1726
  role: 'assistant',
1107
1727
  content: textOut?.text || '',
1108
1728
  interrupted: speechHandle.interrupted,
1729
+ metrics: replyAssistantMetrics,
1109
1730
  });
1110
1731
  this.agent._chatCtx.insert(message);
1111
1732
  this.agentSession._conversationItemAdded(message);
@@ -1113,19 +1734,51 @@ export class AgentActivity implements RecognitionHooks {
1113
1734
 
1114
1735
  if (this.agentSession.agentState === 'speaking') {
1115
1736
  this.agentSession._updateAgentState('listening');
1737
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1738
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1739
+ }
1740
+ this.restoreInterruptionByAudioActivity();
1116
1741
  }
1117
1742
  }
1118
1743
 
1119
- private async pipelineReplyTask(
1120
- speechHandle: SpeechHandle,
1121
- chatCtx: ChatContext,
1122
- toolCtx: ToolContext,
1123
- modelSettings: ModelSettings,
1124
- replyAbortController: AbortController,
1125
- instructions?: string,
1126
- newMessage?: ChatMessage,
1127
- toolsMessages?: ChatItem[],
1128
- ): Promise<void> {
1744
+ private _pipelineReplyTaskImpl = async ({
1745
+ speechHandle,
1746
+ chatCtx,
1747
+ toolCtx,
1748
+ modelSettings,
1749
+ replyAbortController,
1750
+ instructions,
1751
+ newMessage,
1752
+ toolsMessages,
1753
+ span,
1754
+ _previousUserMetrics,
1755
+ }: {
1756
+ speechHandle: SpeechHandle;
1757
+ chatCtx: ChatContext;
1758
+ toolCtx: ToolContext;
1759
+ modelSettings: ModelSettings;
1760
+ replyAbortController: AbortController;
1761
+ instructions?: string;
1762
+ newMessage?: ChatMessage;
1763
+ toolsMessages?: ChatItem[];
1764
+ span: Span;
1765
+ _previousUserMetrics?: MetricsReport;
1766
+ }): Promise<void> => {
1767
+ speechHandle._agentTurnContext = otelContext.active();
1768
+
1769
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1770
+ if (instructions) {
1771
+ span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
1772
+ }
1773
+ if (newMessage) {
1774
+ span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
1775
+ }
1776
+
1777
+ const localParticipant = this.agentSession._roomIO?.localParticipant;
1778
+ if (localParticipant) {
1779
+ setParticipantSpanAttributes(span, localParticipant);
1780
+ }
1781
+
1129
1782
  speechHandleStorage.enterWith(speechHandle);
1130
1783
 
1131
1784
  const audioOutput = this.agentSession.output.audioEnabled
@@ -1137,10 +1790,9 @@ export class AgentActivity implements RecognitionHooks {
1137
1790
 
1138
1791
  chatCtx = chatCtx.copy();
1139
1792
 
1793
+ // Insert new message into temporary chat context for LLM inference
1140
1794
  if (newMessage) {
1141
1795
  chatCtx.insert(newMessage);
1142
- this.agent._chatCtx.insert(newMessage);
1143
- this.agentSession._conversationItemAdded(newMessage);
1144
1796
  }
1145
1797
 
1146
1798
  if (instructions) {
@@ -1155,7 +1807,6 @@ export class AgentActivity implements RecognitionHooks {
1155
1807
  }
1156
1808
  }
1157
1809
 
1158
- this.agentSession._updateAgentState('thinking');
1159
1810
  const tasks: Array<Task<void>> = [];
1160
1811
  const [llmTask, llmGenData] = performLLMInference(
1161
1812
  // preserve `this` context in llmNode
@@ -1164,25 +1815,43 @@ export class AgentActivity implements RecognitionHooks {
1164
1815
  toolCtx,
1165
1816
  modelSettings,
1166
1817
  replyAbortController,
1818
+ this.llm?.model,
1819
+ this.llm?.provider,
1167
1820
  );
1168
1821
  tasks.push(llmTask);
1169
1822
 
1170
- const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
1171
-
1172
1823
  let ttsTask: Task<void> | null = null;
1173
- let ttsStream: ReadableStream<AudioFrame> | null = null;
1824
+ let ttsGenData: _TTSGenerationData | null = null;
1825
+ let llmOutput: ReadableStream<string>;
1826
+
1174
1827
  if (audioOutput) {
1175
- [ttsTask, ttsStream] = performTTSInference(
1828
+ // Only tee the stream when we need TTS
1829
+ const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
1830
+ llmOutput = textOutput;
1831
+ [ttsTask, ttsGenData] = performTTSInference(
1176
1832
  (...args) => this.agent.ttsNode(...args),
1177
1833
  ttsTextInput,
1178
1834
  modelSettings,
1179
1835
  replyAbortController,
1836
+ this.tts?.model,
1837
+ this.tts?.provider,
1180
1838
  );
1181
1839
  tasks.push(ttsTask);
1840
+ } else {
1841
+ // No TTS needed, use the stream directly
1842
+ llmOutput = llmGenData.textStream;
1182
1843
  }
1183
1844
 
1184
1845
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1185
1846
 
1847
+ let userMetrics: MetricsReport | undefined = _previousUserMetrics;
1848
+ // Add new message to actual chat context if the speech is scheduled
1849
+ if (newMessage && speechHandle.scheduled) {
1850
+ this.agent._chatCtx.insert(newMessage);
1851
+ this.agentSession._conversationItemAdded(newMessage);
1852
+ userMetrics = newMessage.metrics;
1853
+ }
1854
+
1186
1855
  if (speechHandle.interrupted) {
1187
1856
  replyAbortController.abort();
1188
1857
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1195,7 +1864,26 @@ export class AgentActivity implements RecognitionHooks {
1195
1864
  speechHandle._clearAuthorization();
1196
1865
 
1197
1866
  const replyStartedAt = Date.now();
1198
- const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1867
+
1868
+ // Determine the transcription input source
1869
+ let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
1870
+
1871
+ // Check if we should use TTS aligned transcripts
1872
+ if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
1873
+ // Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
1874
+ const timedTextsStream = await Promise.race([
1875
+ ttsGenData.timedTextsFut.await,
1876
+ ttsTask?.result.catch(() =>
1877
+ this.logger.warn('TTS task failed before resolving timedTextsFut'),
1878
+ ) ?? Promise.resolve(),
1879
+ ]);
1880
+ if (timedTextsStream) {
1881
+ this.logger.debug('Using TTS aligned transcripts for transcription node input');
1882
+ transcriptionInput = timedTextsStream;
1883
+ }
1884
+ }
1885
+
1886
+ const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
1199
1887
  let textOut: _TextOut | null = null;
1200
1888
  if (trNodeResult) {
1201
1889
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1207,37 +1895,54 @@ export class AgentActivity implements RecognitionHooks {
1207
1895
  textOut = _textOut;
1208
1896
  }
1209
1897
 
1210
- const onFirstFrame = () => {
1211
- this.agentSession._updateAgentState('speaking');
1898
+ let agentStartedSpeakingAt: number | undefined;
1899
+ const onFirstFrame = (startedSpeakingAt?: number) => {
1900
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1901
+ this.agentSession._updateAgentState('speaking', {
1902
+ startTime: startedSpeakingAt,
1903
+ otelContext: speechHandle._agentTurnContext,
1904
+ });
1905
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1906
+ this.audioRecognition.onStartOfAgentSpeech();
1907
+ this.isInterruptionByAudioActivityEnabled = false;
1908
+ }
1212
1909
  };
1213
1910
 
1214
1911
  let audioOut: _AudioOut | null = null;
1215
1912
  if (audioOutput) {
1216
- if (ttsStream) {
1913
+ if (ttsGenData) {
1217
1914
  const [forwardTask, _audioOut] = performAudioForwarding(
1218
- ttsStream,
1915
+ ttsGenData.audioStream,
1219
1916
  audioOutput,
1220
1917
  replyAbortController,
1221
1918
  );
1222
1919
  audioOut = _audioOut;
1223
1920
  tasks.push(forwardTask);
1224
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1921
+ audioOut.firstFrameFut.await
1922
+ .then((ts) => onFirstFrame(ts))
1923
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
1225
1924
  } else {
1226
- throw Error('ttsStream is null when audioOutput is enabled');
1925
+ throw Error('ttsGenData is null when audioOutput is enabled');
1227
1926
  }
1228
1927
  } else {
1229
- textOut?.firstTextFut.await.finally(onFirstFrame);
1928
+ textOut?.firstTextFut.await
1929
+ .then(() => onFirstFrame())
1930
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1230
1931
  }
1231
1932
 
1232
1933
  //TODO(AJS-272): before executing tools, make sure we generated all the text
1233
1934
  // (this ensure everything is kept ordered)
1234
1935
 
1235
- const onToolExecutionStarted = (_: FunctionCall) => {
1236
- // TODO(brian): handle speech_handle item_added
1936
+ const onToolExecutionStarted = (f: FunctionCall) => {
1937
+ speechHandle._itemAdded([f]);
1938
+ this.agent._chatCtx.items.push(f);
1939
+ this.agentSession._toolItemsAdded([f]);
1237
1940
  };
1238
1941
 
1239
- const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
1240
- // TODO(brian): handle speech_handle item_added
1942
+ const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
1943
+ if (out.toolCallOutput) {
1944
+ speechHandle._itemAdded([out.toolCallOutput]);
1945
+ }
1241
1946
  };
1242
1947
 
1243
1948
  const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1257,12 +1962,45 @@ export class AgentActivity implements RecognitionHooks {
1257
1962
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1258
1963
  }
1259
1964
 
1965
+ const agentStoppedSpeakingAt = Date.now();
1966
+ const assistantMetrics: MetricsReport = {};
1967
+
1968
+ if (llmGenData.ttft !== undefined) {
1969
+ assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
1970
+ }
1971
+ if (ttsGenData?.ttfb !== undefined) {
1972
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
1973
+ }
1974
+ if (agentStartedSpeakingAt !== undefined) {
1975
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
1976
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
1977
+
1978
+ if (userMetrics?.stoppedSpeakingAt !== undefined) {
1979
+ const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
1980
+ assistantMetrics.e2eLatency = e2eLatency;
1981
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1982
+ }
1983
+ }
1984
+
1985
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1986
+ let hasSpeechMessage = false;
1987
+
1260
1988
  // add the tools messages that triggers this reply to the chat context
1261
1989
  if (toolsMessages) {
1262
1990
  for (const msg of toolsMessages) {
1263
1991
  msg.createdAt = replyStartedAt;
1264
1992
  }
1265
- this.agent._chatCtx.insert(toolsMessages);
1993
+ // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
1994
+ // were already added by onToolExecutionStarted when the tool execution began.
1995
+ // Inserting function_calls again would create duplicates that break provider APIs
1996
+ // (e.g. Google's "function response parts != function call parts" error).
1997
+ const toolCallOutputs = toolsMessages.filter(
1998
+ (m): m is FunctionCallOutput => m.type === 'function_call_output',
1999
+ );
2000
+ if (toolCallOutputs.length > 0) {
2001
+ this.agent._chatCtx.insert(toolCallOutputs);
2002
+ this.agentSession._toolItemsAdded(toolCallOutputs);
2003
+ }
1266
2004
  }
1267
2005
 
1268
2006
  if (speechHandle.interrupted) {
@@ -1270,20 +2008,24 @@ export class AgentActivity implements RecognitionHooks {
1270
2008
  { speech_id: speechHandle.id },
1271
2009
  'Aborting all pipeline reply tasks due to interruption',
1272
2010
  );
2011
+
2012
+ // Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
2013
+ // will correctly (but undesirably) commit a long transcript even though the user said "stop".
2014
+ if (audioOutput) {
2015
+ audioOutput.clearBuffer();
2016
+ }
2017
+
1273
2018
  replyAbortController.abort();
1274
- await Promise.allSettled(
1275
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1276
- );
2019
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1277
2020
 
1278
2021
  let forwardedText = textOut?.text || '';
1279
2022
 
1280
2023
  if (audioOutput) {
1281
- audioOutput.clearBuffer();
1282
2024
  const playbackEv = await audioOutput.waitForPlayout();
1283
- if (audioOut?.firstFrameFut.done) {
2025
+ if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
1284
2026
  // playback EV is valid only if the first frame was already played
1285
2027
  this.logger.info(
1286
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
2028
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
1287
2029
  'playout interrupted',
1288
2030
  );
1289
2031
  if (playbackEv.synchronizedTranscript) {
@@ -1295,43 +2037,54 @@ export class AgentActivity implements RecognitionHooks {
1295
2037
  }
1296
2038
 
1297
2039
  if (forwardedText) {
2040
+ hasSpeechMessage = true;
1298
2041
  const message = ChatMessage.create({
1299
2042
  role: 'assistant',
1300
2043
  content: forwardedText,
1301
2044
  id: llmGenData.id,
1302
2045
  interrupted: true,
1303
2046
  createdAt: replyStartedAt,
2047
+ metrics: assistantMetrics,
1304
2048
  });
1305
2049
  chatCtx.insert(message);
1306
2050
  this.agent._chatCtx.insert(message);
2051
+ speechHandle._itemAdded([message]);
1307
2052
  this.agentSession._conversationItemAdded(message);
2053
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1308
2054
  }
1309
2055
 
1310
2056
  if (this.agentSession.agentState === 'speaking') {
1311
2057
  this.agentSession._updateAgentState('listening');
2058
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2059
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2060
+ this.restoreInterruptionByAudioActivity();
2061
+ }
1312
2062
  }
1313
2063
 
1314
2064
  this.logger.info(
1315
2065
  { speech_id: speechHandle.id, message: forwardedText },
1316
2066
  'playout completed with interrupt',
1317
2067
  );
1318
- // TODO(shubhra) add chat message to speech handle
1319
2068
  speechHandle._markGenerationDone();
1320
2069
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1321
2070
  return;
1322
2071
  }
1323
2072
 
1324
2073
  if (textOut && textOut.text) {
2074
+ hasSpeechMessage = true;
1325
2075
  const message = ChatMessage.create({
1326
2076
  role: 'assistant',
1327
2077
  id: llmGenData.id,
1328
2078
  interrupted: false,
1329
2079
  createdAt: replyStartedAt,
1330
2080
  content: textOut.text,
2081
+ metrics: assistantMetrics,
1331
2082
  });
1332
2083
  chatCtx.insert(message);
1333
2084
  this.agent._chatCtx.insert(message);
2085
+ speechHandle._itemAdded([message]);
1334
2086
  this.agentSession._conversationItemAdded(message);
2087
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1335
2088
  this.logger.info(
1336
2089
  { speech_id: speechHandle.id, message: textOut.text },
1337
2090
  'playout completed without interruption',
@@ -1342,6 +2095,12 @@ export class AgentActivity implements RecognitionHooks {
1342
2095
  this.agentSession._updateAgentState('thinking');
1343
2096
  } else if (this.agentSession.agentState === 'speaking') {
1344
2097
  this.agentSession._updateAgentState('listening');
2098
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2099
+ {
2100
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2101
+ this.restoreInterruptionByAudioActivity();
2102
+ }
2103
+ }
1345
2104
  }
1346
2105
 
1347
2106
  // mark the playout done before waiting for the tool execution
@@ -1351,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks {
1351
2110
  if (toolOutput.output.length === 0) return;
1352
2111
 
1353
2112
  // important: no agent output should be used after this point
1354
- const { maxToolSteps } = this.agentSession.options;
2113
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1355
2114
  if (speechHandle.numSteps >= maxToolSteps) {
1356
2115
  this.logger.warn(
1357
2116
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1360,52 +2119,18 @@ export class AgentActivity implements RecognitionHooks {
1360
2119
  return;
1361
2120
  }
1362
2121
 
1363
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1364
- functionCalls: [],
1365
- functionCallOutputs: [],
1366
- });
1367
- let shouldGenerateToolReply: boolean = false;
1368
- let newAgentTask: Agent | null = null;
1369
- let ignoreTaskSwitch: boolean = false;
1370
-
1371
- for (const sanitizedOut of toolOutput.output) {
1372
- if (sanitizedOut.toolCallOutput !== undefined) {
1373
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1374
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1375
- if (sanitizedOut.replyRequired) {
1376
- shouldGenerateToolReply = true;
1377
- }
1378
- }
1379
-
1380
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
1381
- this.logger.error('expected to receive only one agent task from the tool executions');
1382
- ignoreTaskSwitch = true;
1383
- // TODO(brian): should we mark the function call as failed to notify the LLM?
1384
- }
1385
-
1386
- newAgentTask = sanitizedOut.agentTask ?? null;
1387
-
1388
- this.logger.debug(
1389
- {
1390
- speechId: speechHandle.id,
1391
- name: sanitizedOut.toolCall?.name,
1392
- args: sanitizedOut.toolCall.args,
1393
- output: sanitizedOut.toolCallOutput?.output,
1394
- isError: sanitizedOut.toolCallOutput?.isError,
1395
- },
1396
- 'Tool call execution finished',
1397
- );
1398
- }
2122
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
2123
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1399
2124
 
1400
2125
  this.agentSession.emit(
1401
2126
  AgentSessionEventTypes.FunctionToolsExecuted,
1402
2127
  functionToolsExecutedEvent,
1403
2128
  );
1404
2129
 
1405
- let draining = this.draining;
2130
+ let schedulingPaused = this.schedulingPaused;
1406
2131
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1407
2132
  this.agentSession.updateAgent(newAgentTask);
1408
- draining = true;
2133
+ schedulingPaused = true;
1409
2134
  }
1410
2135
 
1411
2136
  const toolMessages = [
@@ -1415,28 +2140,19 @@ export class AgentActivity implements RecognitionHooks {
1415
2140
  if (shouldGenerateToolReply) {
1416
2141
  chatCtx.insert(toolMessages);
1417
2142
 
1418
- const handle = SpeechHandle.create({
1419
- allowInterruptions: speechHandle.allowInterruptions,
1420
- stepIndex: speechHandle._stepIndex + 1,
1421
- parent: speechHandle,
1422
- });
1423
- this.agentSession.emit(
1424
- AgentSessionEventTypes.SpeechCreated,
1425
- createSpeechCreatedEvent({
1426
- userInitiated: false,
1427
- source: 'tool_response',
1428
- speechHandle: handle,
1429
- }),
1430
- );
2143
+ // Increment step count on SAME handle (parity with Python agent_activity.py L2081)
2144
+ speechHandle._numSteps += 1;
1431
2145
 
1432
2146
  // Avoid setting tool_choice to "required" or a specific function when
1433
2147
  // passing tool response back to the LLM
1434
- const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2148
+ const respondToolChoice =
2149
+ schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1435
2150
 
2151
+ // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
1436
2152
  const toolResponseTask = this.createSpeechTask({
1437
- task: Task.from(() =>
2153
+ taskFn: () =>
1438
2154
  this.pipelineReplyTask(
1439
- handle,
2155
+ speechHandle,
1440
2156
  chatCtx,
1441
2157
  toolCtx,
1442
2158
  { toolChoice: respondToolChoice },
@@ -1444,22 +2160,61 @@ export class AgentActivity implements RecognitionHooks {
1444
2160
  instructions,
1445
2161
  undefined,
1446
2162
  toolMessages,
2163
+ hasSpeechMessage ? undefined : userMetrics,
1447
2164
  ),
1448
- ),
1449
- ownedSpeechHandle: handle,
2165
+ ownedSpeechHandle: speechHandle,
1450
2166
  name: 'AgentActivity.pipelineReply',
1451
2167
  });
1452
2168
 
1453
- toolResponseTask.finally(() => this.onPipelineReplyDone());
2169
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1454
2170
 
1455
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
2171
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1456
2172
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1457
2173
  for (const msg of toolMessages) {
1458
2174
  msg.createdAt = replyStartedAt;
1459
2175
  }
1460
- this.agent._chatCtx.insert(toolMessages);
2176
+
2177
+ const toolCallOutputs = toolMessages.filter(
2178
+ (m): m is FunctionCallOutput => m.type === 'function_call_output',
2179
+ );
2180
+
2181
+ if (toolCallOutputs.length > 0) {
2182
+ this.agent._chatCtx.insert(toolCallOutputs);
2183
+ this.agentSession._toolItemsAdded(toolCallOutputs);
2184
+ }
1461
2185
  }
1462
- }
2186
+ };
2187
+
2188
+ private pipelineReplyTask = async (
2189
+ speechHandle: SpeechHandle,
2190
+ chatCtx: ChatContext,
2191
+ toolCtx: ToolContext,
2192
+ modelSettings: ModelSettings,
2193
+ replyAbortController: AbortController,
2194
+ instructions?: string,
2195
+ newMessage?: ChatMessage,
2196
+ toolsMessages?: ChatItem[],
2197
+ _previousUserMetrics?: MetricsReport,
2198
+ ): Promise<void> =>
2199
+ tracer.startActiveSpan(
2200
+ async (span) =>
2201
+ this._pipelineReplyTaskImpl({
2202
+ speechHandle,
2203
+ chatCtx,
2204
+ toolCtx,
2205
+ modelSettings,
2206
+ replyAbortController,
2207
+ instructions,
2208
+ newMessage,
2209
+ toolsMessages,
2210
+ span,
2211
+ _previousUserMetrics,
2212
+ }),
2213
+ {
2214
+ name: 'agent_turn',
2215
+ context: this.agentSession.rootSpanContext,
2216
+ },
2217
+ );
1463
2218
 
1464
2219
  private async realtimeGenerationTask(
1465
2220
  speechHandle: SpeechHandle,
@@ -1467,6 +2222,44 @@ export class AgentActivity implements RecognitionHooks {
1467
2222
  modelSettings: ModelSettings,
1468
2223
  replyAbortController: AbortController,
1469
2224
  ): Promise<void> {
2225
+ return tracer.startActiveSpan(
2226
+ async (span) =>
2227
+ this._realtimeGenerationTaskImpl({
2228
+ speechHandle,
2229
+ ev,
2230
+ modelSettings,
2231
+ replyAbortController,
2232
+ span,
2233
+ }),
2234
+ {
2235
+ name: 'agent_turn',
2236
+ context: this.agentSession.rootSpanContext,
2237
+ },
2238
+ );
2239
+ }
2240
+
2241
+ private async _realtimeGenerationTaskImpl({
2242
+ speechHandle,
2243
+ ev,
2244
+ modelSettings,
2245
+ replyAbortController,
2246
+ span,
2247
+ }: {
2248
+ speechHandle: SpeechHandle;
2249
+ ev: GenerationCreatedEvent;
2250
+ modelSettings: ModelSettings;
2251
+ replyAbortController: AbortController;
2252
+ span: Span;
2253
+ }): Promise<void> {
2254
+ speechHandle._agentTurnContext = otelContext.active();
2255
+
2256
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
2257
+
2258
+ const localParticipant = this.agentSession._roomIO?.localParticipant;
2259
+ if (localParticipant) {
2260
+ setParticipantSpanAttributes(span, localParticipant);
2261
+ }
2262
+
1470
2263
  speechHandleStorage.enterWith(speechHandle);
1471
2264
 
1472
2265
  if (!this.realtimeSession) {
@@ -1476,6 +2269,12 @@ export class AgentActivity implements RecognitionHooks {
1476
2269
  throw new Error('llm is not a realtime model');
1477
2270
  }
1478
2271
 
2272
+ // Store span for metrics recording when they arrive later
2273
+ span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
2274
+ if (this.realtimeSpans && ev.responseId) {
2275
+ this.realtimeSpans.set(ev.responseId, span);
2276
+ }
2277
+
1479
2278
  this.logger.debug(
1480
2279
  { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1481
2280
  'realtime generation started',
@@ -1496,14 +2295,21 @@ export class AgentActivity implements RecognitionHooks {
1496
2295
  return;
1497
2296
  }
1498
2297
 
1499
- const onFirstFrame = () => {
1500
- this.agentSession._updateAgentState('speaking');
2298
+ const onFirstFrame = (startedSpeakingAt?: number) => {
2299
+ this.agentSession._updateAgentState('speaking', {
2300
+ startTime: startedSpeakingAt,
2301
+ otelContext: speechHandle._agentTurnContext,
2302
+ });
1501
2303
  };
1502
2304
 
1503
2305
  const readMessages = async (
1504
2306
  abortController: AbortController,
1505
- outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
2307
+ outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
1506
2308
  ) => {
2309
+ replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
2310
+ once: true,
2311
+ });
2312
+
1507
2313
  const forwardTasks: Array<Task<void>> = [];
1508
2314
  try {
1509
2315
  for await (const msg of ev.messageStream) {
@@ -1513,7 +2319,25 @@ export class AgentActivity implements RecognitionHooks {
1513
2319
  );
1514
2320
  break;
1515
2321
  }
1516
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
2322
+
2323
+ const msgModalities = msg.modalities ? await msg.modalities : undefined;
2324
+ let ttsTextInput: ReadableStream<string | TimedString> | null = null;
2325
+ let trTextInput: ReadableStream<string | TimedString>;
2326
+
2327
+ if (msgModalities && !msgModalities.includes('audio') && this.tts) {
2328
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
2329
+ this.logger.warn(
2330
+ 'text response received from realtime API, falling back to use a TTS model.',
2331
+ );
2332
+ }
2333
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
2334
+ ttsTextInput = _ttsTextInput;
2335
+ trTextInput = _trTextInput;
2336
+ } else {
2337
+ trTextInput = msg.textStream;
2338
+ }
2339
+
2340
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1517
2341
  let textOut: _TextOut | null = null;
1518
2342
  if (trNodeResult) {
1519
2343
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1524,30 +2348,57 @@ export class AgentActivity implements RecognitionHooks {
1524
2348
  forwardTasks.push(textForwardTask);
1525
2349
  textOut = _textOut;
1526
2350
  }
2351
+
1527
2352
  let audioOut: _AudioOut | null = null;
1528
2353
  if (audioOutput) {
1529
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1530
- msg.audioStream,
1531
- modelSettings,
1532
- );
1533
- if (realtimeAudio) {
1534
- const [forwardTask, _audioOut] = performAudioForwarding(
1535
- realtimeAudio,
1536
- audioOutput,
2354
+ let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
2355
+
2356
+ if (ttsTextInput) {
2357
+ const [ttsTask, ttsGenData] = performTTSInference(
2358
+ (...args) => this.agent.ttsNode(...args),
2359
+ ttsTextInput,
2360
+ modelSettings,
1537
2361
  abortController,
2362
+ this.tts?.model,
2363
+ this.tts?.provider,
2364
+ );
2365
+ tasks.push(ttsTask);
2366
+ realtimeAudioResult = ttsGenData.audioStream;
2367
+ } else if (msgModalities && msgModalities.includes('audio')) {
2368
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
2369
+ msg.audioStream,
2370
+ modelSettings,
2371
+ );
2372
+ } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
2373
+ this.logger.error(
2374
+ 'Text message received from Realtime API with audio modality. ' +
2375
+ 'This usually happens when text chat context is synced to the API. ' +
2376
+ 'Try to add a TTS model as fallback or use text modality with TTS instead.',
1538
2377
  );
1539
- forwardTasks.push(forwardTask);
1540
- audioOut = _audioOut;
1541
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1542
2378
  } else {
1543
2379
  this.logger.warn(
1544
2380
  'audio output is enabled but neither tts nor realtime audio is available',
1545
2381
  );
1546
2382
  }
2383
+
2384
+ if (realtimeAudioResult) {
2385
+ const [forwardTask, _audioOut] = performAudioForwarding(
2386
+ realtimeAudioResult,
2387
+ audioOutput,
2388
+ abortController,
2389
+ );
2390
+ forwardTasks.push(forwardTask);
2391
+ audioOut = _audioOut;
2392
+ audioOut.firstFrameFut.await
2393
+ .then((ts) => onFirstFrame(ts))
2394
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
2395
+ }
1547
2396
  } else if (textOut) {
1548
- textOut.firstTextFut.await.finally(onFirstFrame);
2397
+ textOut.firstTextFut.await
2398
+ .then(() => onFirstFrame())
2399
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1549
2400
  }
1550
- outputs.push([msg.messageId, textOut, audioOut]);
2401
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1551
2402
  }
1552
2403
  await waitFor(forwardTasks);
1553
2404
  } catch (error) {
@@ -1557,11 +2408,13 @@ export class AgentActivity implements RecognitionHooks {
1557
2408
  }
1558
2409
  };
1559
2410
 
1560
- const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
2411
+ const messageOutputs: Array<
2412
+ [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
2413
+ > = [];
1561
2414
  const tasks = [
1562
2415
  Task.from(
1563
2416
  (controller) => readMessages(controller, messageOutputs),
1564
- replyAbortController,
2417
+ undefined,
1565
2418
  'AgentActivity.realtime_generation.read_messages',
1566
2419
  ),
1567
2420
  ];
@@ -1598,6 +2451,8 @@ export class AgentActivity implements RecognitionHooks {
1598
2451
 
1599
2452
  const onToolExecutionStarted = (f: FunctionCall) => {
1600
2453
  speechHandle._itemAdded([f]);
2454
+ this.agent._chatCtx.items.push(f);
2455
+ this.agentSession._toolItemsAdded([f]);
1601
2456
  };
1602
2457
 
1603
2458
  const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
@@ -1623,7 +2478,6 @@ export class AgentActivity implements RecognitionHooks {
1623
2478
 
1624
2479
  if (audioOutput) {
1625
2480
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1626
- this.agentSession._updateAgentState('listening');
1627
2481
  }
1628
2482
 
1629
2483
  if (speechHandle.interrupted) {
@@ -1636,17 +2490,17 @@ export class AgentActivity implements RecognitionHooks {
1636
2490
 
1637
2491
  if (messageOutputs.length > 0) {
1638
2492
  // there should be only one message
1639
- const [msgId, textOut, audioOut] = messageOutputs[0]!;
2493
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
1640
2494
  let forwardedText = textOut?.text || '';
1641
2495
 
1642
2496
  if (audioOutput) {
1643
2497
  audioOutput.clearBuffer();
1644
2498
  const playbackEv = await audioOutput.waitForPlayout();
1645
- let playbackPosition = playbackEv.playbackPosition;
1646
- if (audioOut?.firstFrameFut.done) {
2499
+ let playbackPositionInS = playbackEv.playbackPosition;
2500
+ if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
1647
2501
  // playback EV is valid only if the first frame was already played
1648
2502
  this.logger.info(
1649
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
2503
+ { speech_id: speechHandle.id, playbackPositionInS },
1650
2504
  'playout interrupted',
1651
2505
  );
1652
2506
  if (playbackEv.synchronizedTranscript) {
@@ -1654,13 +2508,15 @@ export class AgentActivity implements RecognitionHooks {
1654
2508
  }
1655
2509
  } else {
1656
2510
  forwardedText = '';
1657
- playbackPosition = 0;
2511
+ playbackPositionInS = 0;
1658
2512
  }
1659
2513
 
1660
2514
  // truncate server-side message
1661
2515
  this.realtimeSession.truncate({
1662
2516
  messageId: msgId,
1663
- audioEndMs: Math.floor(playbackPosition),
2517
+ audioEndMs: Math.floor(playbackPositionInS * 1000),
2518
+ modalities: msgModalities,
2519
+ audioTranscript: forwardedText,
1664
2520
  });
1665
2521
  }
1666
2522
 
@@ -1691,7 +2547,7 @@ export class AgentActivity implements RecognitionHooks {
1691
2547
 
1692
2548
  if (messageOutputs.length > 0) {
1693
2549
  // there should be only one message
1694
- const [msgId, textOut, _] = messageOutputs[0]!;
2550
+ const [msgId, textOut, _, __] = messageOutputs[0]!;
1695
2551
  const message = ChatMessage.create({
1696
2552
  role: 'assistant',
1697
2553
  content: textOut?.text || '',
@@ -1708,16 +2564,20 @@ export class AgentActivity implements RecognitionHooks {
1708
2564
  speechHandle._markGenerationDone();
1709
2565
  // TODO(brian): close tees
1710
2566
 
1711
- toolOutput.firstToolStartedFuture.await.finally(() => {
1712
- this.agentSession._updateAgentState('thinking');
1713
- });
1714
-
1715
2567
  await executeToolsTask.result;
1716
2568
 
1717
- if (toolOutput.output.length === 0) return;
2569
+ if (toolOutput.output.length > 0) {
2570
+ this.agentSession._updateAgentState('thinking');
2571
+ } else if (this.agentSession.agentState === 'speaking') {
2572
+ this.agentSession._updateAgentState('listening');
2573
+ }
2574
+
2575
+ if (toolOutput.output.length === 0) {
2576
+ return;
2577
+ }
1718
2578
 
1719
2579
  // important: no agent ouput should be used after this point
1720
- const { maxToolSteps } = this.agentSession.options;
2580
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1721
2581
  if (speechHandle.numSteps >= maxToolSteps) {
1722
2582
  this.logger.warn(
1723
2583
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1726,55 +2586,42 @@ export class AgentActivity implements RecognitionHooks {
1726
2586
  return;
1727
2587
  }
1728
2588
 
1729
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1730
- functionCalls: [],
1731
- functionCallOutputs: [],
1732
- });
1733
- let shouldGenerateToolReply: boolean = false;
1734
- let newAgentTask: Agent | null = null;
1735
- let ignoreTaskSwitch: boolean = false;
1736
-
1737
- for (const sanitizedOut of toolOutput.output) {
1738
- if (sanitizedOut.toolCallOutput !== undefined) {
1739
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1740
- if (sanitizedOut.replyRequired) {
1741
- shouldGenerateToolReply = true;
1742
- }
1743
- }
1744
-
1745
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
1746
- this.logger.error('expected to receive only one agent task from the tool executions');
1747
- ignoreTaskSwitch = true;
1748
- }
1749
-
1750
- newAgentTask = sanitizedOut.agentTask ?? null;
1751
-
1752
- this.logger.debug(
1753
- {
1754
- speechId: speechHandle.id,
1755
- name: sanitizedOut.toolCall?.name,
1756
- args: sanitizedOut.toolCall.args,
1757
- output: sanitizedOut.toolCallOutput?.output,
1758
- isError: sanitizedOut.toolCallOutput?.isError,
1759
- },
1760
- 'Tool call execution finished',
1761
- );
1762
- }
2589
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
2590
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1763
2591
 
1764
2592
  this.agentSession.emit(
1765
2593
  AgentSessionEventTypes.FunctionToolsExecuted,
1766
2594
  functionToolsExecutedEvent,
1767
2595
  );
1768
2596
 
1769
- let draining = this.draining;
2597
+ let schedulingPaused = this.schedulingPaused;
1770
2598
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1771
2599
  this.agentSession.updateAgent(newAgentTask);
1772
- draining = true;
2600
+ schedulingPaused = true;
1773
2601
  }
1774
2602
 
1775
2603
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
2604
+ // wait all speeches played before updating the tool output and generating the response
2605
+ // most realtime models dont support generating multiple responses at the same time
2606
+ while (this.currentSpeech || this.speechQueue.size() > 0) {
2607
+ if (
2608
+ this.currentSpeech &&
2609
+ !this.currentSpeech.done() &&
2610
+ this.currentSpeech !== speechHandle
2611
+ ) {
2612
+ await this.currentSpeech.waitForPlayout();
2613
+ } else {
2614
+ // Don't block the event loop
2615
+ await new Promise((resolve) => setImmediate(resolve));
2616
+ }
2617
+ }
1776
2618
  const chatCtx = this.realtimeSession.chatCtx.copy();
1777
2619
  chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
2620
+
2621
+ this.agentSession._toolItemsAdded(
2622
+ functionToolsExecutedEvent.functionCallOutputs as FunctionCallOutput[],
2623
+ );
2624
+
1778
2625
  try {
1779
2626
  await this.realtimeSession.updateChatCtx(chatCtx);
1780
2627
  } catch (error) {
@@ -1806,15 +2653,14 @@ export class AgentActivity implements RecognitionHooks {
1806
2653
  }),
1807
2654
  );
1808
2655
 
1809
- const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2656
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1810
2657
  this.createSpeechTask({
1811
- task: Task.from((abortController: AbortController) =>
2658
+ taskFn: (abortController: AbortController) =>
1812
2659
  this.realtimeReplyTask({
1813
2660
  speechHandle: replySpeechHandle,
1814
2661
  modelSettings: { toolChoice },
1815
2662
  abortController,
1816
2663
  }),
1817
- ),
1818
2664
  ownedSpeechHandle: replySpeechHandle,
1819
2665
  name: 'AgentActivity.realtime_reply',
1820
2666
  });
@@ -1822,6 +2668,53 @@ export class AgentActivity implements RecognitionHooks {
1822
2668
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1823
2669
  }
1824
2670
 
2671
+ private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
2672
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
2673
+ functionCalls: [],
2674
+ functionCallOutputs: [],
2675
+ });
2676
+
2677
+ let shouldGenerateToolReply = false;
2678
+ let newAgentTask: Agent | null = null;
2679
+ let ignoreTaskSwitch = false;
2680
+
2681
+ for (const sanitizedOut of toolOutput.output) {
2682
+ if (sanitizedOut.toolCallOutput !== undefined) {
2683
+ // Keep event payload symmetric for pipeline + realtime paths.
2684
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
2685
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
2686
+ if (sanitizedOut.replyRequired) {
2687
+ shouldGenerateToolReply = true;
2688
+ }
2689
+ }
2690
+
2691
+ if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
2692
+ this.logger.error('expected to receive only one agent task from the tool executions');
2693
+ ignoreTaskSwitch = true;
2694
+ }
2695
+
2696
+ newAgentTask = sanitizedOut.agentTask ?? null;
2697
+
2698
+ this.logger.debug(
2699
+ {
2700
+ speechId: speechHandle.id,
2701
+ name: sanitizedOut.toolCall?.name,
2702
+ args: sanitizedOut.toolCall.args,
2703
+ output: sanitizedOut.toolCallOutput?.output,
2704
+ isError: sanitizedOut.toolCallOutput?.isError,
2705
+ },
2706
+ 'Tool call execution finished',
2707
+ );
2708
+ }
2709
+
2710
+ return {
2711
+ functionToolsExecutedEvent,
2712
+ shouldGenerateToolReply,
2713
+ newAgentTask,
2714
+ ignoreTaskSwitch,
2715
+ };
2716
+ }
2717
+
1825
2718
  private async realtimeReplyTask({
1826
2719
  speechHandle,
1827
2720
  modelSettings: { toolChoice },
@@ -1880,10 +2773,10 @@ export class AgentActivity implements RecognitionHooks {
1880
2773
  priority: number,
1881
2774
  force: boolean = false,
1882
2775
  ): void {
1883
- // when force=true, we allow tool responses to bypass draining
2776
+ // when force=true, we allow tool responses to bypass scheduling pause
1884
2777
  // This allows for tool responses to be generated before the AgentActivity is finalized
1885
- if (this.draining && !force) {
1886
- throw new Error('cannot schedule new speech, the agent is draining');
2778
+ if (this.schedulingPaused && !force) {
2779
+ throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
1887
2780
  }
1888
2781
 
1889
2782
  // Monotonic time to avoid near 0 collisions
@@ -1892,19 +2785,77 @@ export class AgentActivity implements RecognitionHooks {
1892
2785
  this.wakeupMainTask();
1893
2786
  }
1894
2787
 
2788
+ private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
2789
+ if (this._schedulingPaused) return;
2790
+
2791
+ this._schedulingPaused = true;
2792
+ this._drainBlockedTasks = blockedTasks;
2793
+ this.wakeupMainTask();
2794
+
2795
+ if (this._mainTask) {
2796
+ // When pausing/draining, we ensure that all speech_tasks complete fully.
2797
+ // This means that even if the SpeechHandle themselves have finished,
2798
+ // we still wait for the entire execution (e.g function_tools)
2799
+ await this._mainTask.result;
2800
+ }
2801
+ }
2802
+
2803
+ private _resumeSchedulingTask(): void {
2804
+ if (!this._schedulingPaused) return;
2805
+
2806
+ this._schedulingPaused = false;
2807
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
2808
+ }
2809
+
2810
+ async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
2811
+ const { blockedTasks = [] } = options;
2812
+ const unlock = await this.lock.lock();
2813
+
2814
+ try {
2815
+ const span = tracer.startSpan({
2816
+ name: 'pause_agent_activity',
2817
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2818
+ });
2819
+ try {
2820
+ await this._pauseSchedulingTask(blockedTasks);
2821
+ await this._closeSessionResources();
2822
+ } finally {
2823
+ span.end();
2824
+ }
2825
+ } finally {
2826
+ unlock();
2827
+ }
2828
+ }
2829
+
1895
2830
  async drain(): Promise<void> {
2831
+ // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
2832
+ return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
2833
+ name: 'drain_agent_activity',
2834
+ context: ROOT_CONTEXT,
2835
+ });
2836
+ }
2837
+
2838
+ private async _drainImpl(span: Span): Promise<void> {
2839
+ span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
2840
+
1896
2841
  const unlock = await this.lock.lock();
1897
2842
  try {
1898
- if (this._draining) return;
2843
+ if (this._schedulingPaused) return;
1899
2844
 
1900
- this.createSpeechTask({
1901
- task: Task.from(() => this.agent.onExit()),
2845
+ this._onExitTask = this.createSpeechTask({
2846
+ taskFn: () =>
2847
+ tracer.startActiveSpan(async () => this.agent.onExit(), {
2848
+ name: 'on_exit',
2849
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2850
+ }),
2851
+ inlineTask: true,
1902
2852
  name: 'AgentActivity_onExit',
1903
2853
  });
1904
2854
 
1905
- this.wakeupMainTask();
1906
- this._draining = true;
1907
- await this._mainTask?.result;
2855
+ this.cancelPreemptiveGeneration();
2856
+
2857
+ await this._onExitTask.result;
2858
+ await this._pauseSchedulingTask([]);
1908
2859
  } finally {
1909
2860
  unlock();
1910
2861
  }
@@ -1913,42 +2864,160 @@ export class AgentActivity implements RecognitionHooks {
1913
2864
  async close(): Promise<void> {
1914
2865
  const unlock = await this.lock.lock();
1915
2866
  try {
1916
- if (!this._draining) {
1917
- this.logger.warn('task closing without draining');
1918
- }
2867
+ this.cancelPreemptiveGeneration();
1919
2868
 
1920
- // Unregister event handlers to prevent duplicate metrics
1921
- if (this.llm instanceof LLM) {
1922
- this.llm.off('metrics_collected', this.onMetricsCollected);
1923
- }
1924
- if (this.realtimeSession) {
1925
- this.realtimeSession.off('generation_created', this.onGenerationCreated);
1926
- this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
1927
- this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
1928
- this.realtimeSession.off(
1929
- 'input_audio_transcription_completed',
1930
- this.onInputAudioTranscriptionCompleted,
1931
- );
1932
- this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
1933
- }
1934
- if (this.stt instanceof STT) {
1935
- this.stt.off('metrics_collected', this.onMetricsCollected);
2869
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2870
+
2871
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2872
+ this._currentSpeech._markDone();
1936
2873
  }
1937
- if (this.tts instanceof TTS) {
1938
- this.tts.off('metrics_collected', this.onMetricsCollected);
2874
+
2875
+ await this._closeSessionResources();
2876
+
2877
+ if (this._mainTask) {
2878
+ await this._mainTask.cancelAndWait();
1939
2879
  }
1940
- if (this.vad instanceof VAD) {
1941
- this.vad.off('metrics_collected', this.onMetricsCollected);
2880
+ if (this.interruptionDetector) {
2881
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2882
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2883
+ this.interruptionDetector.off('error', this.onInterruptionError);
1942
2884
  }
1943
2885
 
1944
- this.detachAudioInput();
1945
- await this.realtimeSession?.close();
1946
- await this.audioRecognition?.close();
1947
- await this._mainTask?.cancelAndWait();
2886
+ this.agent._agentActivity = undefined;
1948
2887
  } finally {
1949
2888
  unlock();
1950
2889
  }
1951
2890
  }
2891
+
2892
+ private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
2893
+ const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
2894
+ const sessionInterruptionDetection = this.agentSession.interruptionDetection;
2895
+ if (
2896
+ !(
2897
+ this.stt &&
2898
+ this.stt.capabilities.alignedTranscript &&
2899
+ this.stt.capabilities.streaming &&
2900
+ this.vad &&
2901
+ this.turnDetection !== 'manual' &&
2902
+ this.turnDetection !== 'realtime_llm' &&
2903
+ !(this.llm instanceof RealtimeModel)
2904
+ )
2905
+ ) {
2906
+ if (
2907
+ agentInterruptionDetection === 'adaptive' ||
2908
+ sessionInterruptionDetection === 'adaptive'
2909
+ ) {
2910
+ this.logger.warn(
2911
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
2912
+ );
2913
+ }
2914
+ return undefined;
2915
+ }
2916
+
2917
+ if (!this.allowInterruptions) {
2918
+ return undefined;
2919
+ }
2920
+
2921
+ if (agentInterruptionDetection === 'vad') {
2922
+ return undefined;
2923
+ }
2924
+
2925
+ if (sessionInterruptionDetection === 'vad') {
2926
+ return undefined;
2927
+ }
2928
+
2929
+ if (
2930
+ agentInterruptionDetection === undefined &&
2931
+ sessionInterruptionDetection === undefined &&
2932
+ !isHosted() &&
2933
+ !isDevMode()
2934
+ ) {
2935
+ this.logger.info('adaptive interruption is disabled by default in production mode');
2936
+ return undefined;
2937
+ }
2938
+
2939
+ try {
2940
+ const detector = new AdaptiveInterruptionDetector();
2941
+
2942
+ detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech);
2943
+ detector.on('metrics_collected', this.onInterruptionMetricsCollected);
2944
+ detector.on('error', this.onInterruptionError);
2945
+
2946
+ return detector;
2947
+ } catch (error: unknown) {
2948
+ this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
2949
+ }
2950
+ return undefined;
2951
+ }
2952
+
2953
+ private restoreInterruptionByAudioActivity(): void {
2954
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2955
+ }
2956
+
2957
+ private fallbackToVadInterruption(): void {
2958
+ if (!this.isInterruptionDetectionEnabled) return;
2959
+
2960
+ this.isInterruptionDetectionEnabled = false;
2961
+ this.restoreInterruptionByAudioActivity();
2962
+
2963
+ if (this.interruptionDetector) {
2964
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2965
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2966
+ this.interruptionDetector.off('error', this.onInterruptionError);
2967
+ this.interruptionDetector = undefined;
2968
+ }
2969
+
2970
+ if (this.audioRecognition) {
2971
+ this.audioRecognition.disableInterruptionDetection().catch((err) => {
2972
+ this.logger.warn({ err }, 'error while disabling interruption detection');
2973
+ });
2974
+ }
2975
+
2976
+ this.logger.warn(
2977
+ 'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption',
2978
+ );
2979
+ }
2980
+
2981
+ private async _closeSessionResources(): Promise<void> {
2982
+ // Unregister event handlers to prevent duplicate metrics
2983
+ if (this.llm instanceof LLM) {
2984
+ this.llm.off('metrics_collected', this.onMetricsCollected);
2985
+ this.llm.off('error', this.onModelError);
2986
+ }
2987
+
2988
+ if (this.realtimeSession) {
2989
+ this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
2990
+ this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
2991
+ this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
2992
+ this.realtimeSession.off(
2993
+ 'input_audio_transcription_completed',
2994
+ this.onRealtimeInputAudioTranscriptionCompleted,
2995
+ );
2996
+ this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
2997
+ this.realtimeSession.off('error', this.onModelError);
2998
+ }
2999
+
3000
+ if (this.stt instanceof STT) {
3001
+ this.stt.off('metrics_collected', this.onMetricsCollected);
3002
+ this.stt.off('error', this.onModelError);
3003
+ }
3004
+
3005
+ if (this.tts instanceof TTS) {
3006
+ this.tts.off('metrics_collected', this.onMetricsCollected);
3007
+ this.tts.off('error', this.onModelError);
3008
+ }
3009
+
3010
+ if (this.vad instanceof VAD) {
3011
+ this.vad.off('metrics_collected', this.onMetricsCollected);
3012
+ }
3013
+
3014
+ this.detachAudioInput();
3015
+ this.realtimeSpans?.clear();
3016
+ await this.realtimeSession?.close();
3017
+ await this.audioRecognition?.close();
3018
+ this.realtimeSession = undefined;
3019
+ this.audioRecognition = undefined;
3020
+ }
1952
3021
  }
1953
3022
 
1954
3023
  function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {