@livekit/agents 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_exceptions.cjs.map +1 -1
- package/dist/_exceptions.d.ts.map +1 -1
- package/dist/_exceptions.js.map +1 -1
- package/dist/audio.cjs +89 -3
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +36 -1
- package/dist/audio.d.ts +36 -1
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +76 -2
- package/dist/audio.js.map +1 -1
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +165 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +141 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +44 -46
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +45 -47
- package/dist/cli.js.map +1 -1
- package/dist/connection_pool.cjs +242 -0
- package/dist/connection_pool.cjs.map +1 -0
- package/dist/connection_pool.d.cts +123 -0
- package/dist/connection_pool.d.ts +123 -0
- package/dist/connection_pool.d.ts.map +1 -0
- package/dist/connection_pool.js +218 -0
- package/dist/connection_pool.js.map +1 -0
- package/dist/connection_pool.test.cjs +256 -0
- package/dist/connection_pool.test.cjs.map +1 -0
- package/dist/connection_pool.test.js +255 -0
- package/dist/connection_pool.test.js.map +1 -0
- package/dist/constants.cjs +30 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +10 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +20 -0
- package/dist/constants.js.map +1 -1
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/http_server.cjs +9 -6
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +5 -1
- package/dist/http_server.d.ts +5 -1
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js +9 -6
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +24 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -11
- package/dist/index.d.ts +15 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -9
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.cjs +70 -2
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +373 -32
- package/dist/inference/api_protos.d.ts +373 -32
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +62 -2
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +3 -4
- package/dist/inference/index.d.ts +3 -4
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +18 -3
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +163 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +65 -0
- package/dist/inference/interruption/http_transport.d.ts +65 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +137 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +198 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +164 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +347 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +313 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +106 -66
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +65 -43
- package/dist/inference/llm.d.ts +65 -43
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +100 -66
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +319 -170
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +64 -15
- package/dist/inference/stt.d.ts +64 -15
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +319 -170
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +218 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +217 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +249 -71
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +94 -17
- package/dist/inference/tts.d.ts +94 -17
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +249 -77
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +305 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +304 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +14 -1
- package/dist/inference/utils.d.ts +14 -1
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs +6 -3
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
- package/dist/ipc/inference_proc_executor.js +6 -3
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +6 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.ts.map +1 -1
- package/dist/ipc/job_proc_executor.js +6 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +89 -17
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +68 -18
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs +34 -8
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +8 -0
- package/dist/ipc/supervised_proc.d.ts +8 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js +34 -8
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/ipc/supervised_proc.test.cjs +145 -0
- package/dist/ipc/supervised_proc.test.cjs.map +1 -0
- package/dist/ipc/supervised_proc.test.js +122 -0
- package/dist/ipc/supervised_proc.test.js.map +1 -0
- package/dist/job.cjs +109 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +14 -0
- package/dist/job.d.ts +14 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +99 -1
- package/dist/job.js.map +1 -1
- package/dist/language.cjs +394 -0
- package/dist/language.cjs.map +1 -0
- package/dist/language.d.cts +15 -0
- package/dist/language.d.ts +15 -0
- package/dist/language.d.ts.map +1 -0
- package/dist/language.js +363 -0
- package/dist/language.js.map +1 -0
- package/dist/language.test.cjs +43 -0
- package/dist/language.test.cjs.map +1 -0
- package/dist/language.test.js +49 -0
- package/dist/language.test.js.map +1 -0
- package/dist/llm/chat_context.cjs +345 -3
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +86 -2
- package/dist/llm/chat_context.d.ts +86 -2
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +344 -3
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +692 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +692 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/fallback_adapter.cjs +280 -0
- package/dist/llm/fallback_adapter.cjs.map +1 -0
- package/dist/llm/fallback_adapter.d.cts +73 -0
- package/dist/llm/fallback_adapter.d.ts +73 -0
- package/dist/llm/fallback_adapter.d.ts.map +1 -0
- package/dist/llm/fallback_adapter.js +256 -0
- package/dist/llm/fallback_adapter.js.map +1 -0
- package/dist/llm/fallback_adapter.test.cjs +176 -0
- package/dist/llm/fallback_adapter.test.cjs.map +1 -0
- package/dist/llm/fallback_adapter.test.js +175 -0
- package/dist/llm/fallback_adapter.test.js.map +1 -0
- package/dist/llm/index.cjs +11 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +4 -3
- package/dist/llm/index.d.ts +4 -3
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +13 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +65 -11
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +13 -2
- package/dist/llm/llm.d.ts +13 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +65 -11
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs +6 -2
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js +6 -2
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/google.test.cjs +48 -0
- package/dist/llm/provider_format/google.test.cjs.map +1 -1
- package/dist/llm/provider_format/google.test.js +54 -1
- package/dist/llm/provider_format/google.test.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +126 -24
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +124 -23
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +393 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +400 -2
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +5 -4
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +5 -4
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +15 -1
- package/dist/llm/realtime.d.ts +15 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/remote_chat_context.cjs.map +1 -1
- package/dist/llm/remote_chat_context.d.cts +2 -0
- package/dist/llm/remote_chat_context.d.ts +2 -0
- package/dist/llm/remote_chat_context.d.ts.map +1 -1
- package/dist/llm/remote_chat_context.js.map +1 -1
- package/dist/llm/tool_context.cjs +50 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +47 -11
- package/dist/llm/tool_context.d.ts +47 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +48 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +107 -12
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +10 -3
- package/dist/llm/utils.d.ts +10 -3
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +106 -12
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/utils.test.cjs +90 -0
- package/dist/llm/utils.test.cjs.map +1 -1
- package/dist/llm/utils.test.js +98 -2
- package/dist/llm/utils.test.js.map +1 -1
- package/dist/llm/zod-utils.cjs +102 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +64 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +472 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +455 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/log.cjs +45 -14
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +8 -1
- package/dist/log.d.ts +8 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +45 -15
- package/dist/log.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +75 -19
- package/dist/metrics/base.d.ts +75 -19
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +5 -2
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +10 -1
- package/dist/metrics/usage_collector.d.ts +10 -1
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +5 -2
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +23 -7
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +23 -7
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +31 -10
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +31 -10
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +344 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +343 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stream/stream_channel.cjs +39 -1
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +5 -2
- package/dist/stream/stream_channel.d.ts +5 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +39 -1
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stream/stream_channel.test.cjs +27 -0
- package/dist/stream/stream_channel.test.cjs.map +1 -1
- package/dist/stream/stream_channel.test.js +27 -0
- package/dist/stream/stream_channel.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +24 -9
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +24 -9
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +94 -19
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +68 -5
- package/dist/stt/stt.d.ts +68 -5
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +96 -21
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/index.cjs +72 -0
- package/dist/telemetry/index.cjs.map +1 -0
- package/dist/telemetry/index.d.cts +7 -0
- package/dist/telemetry/index.d.ts +7 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +37 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/logging.cjs +65 -0
- package/dist/telemetry/logging.cjs.map +1 -0
- package/dist/telemetry/logging.d.cts +21 -0
- package/dist/telemetry/logging.d.ts +21 -0
- package/dist/telemetry/logging.d.ts.map +1 -0
- package/dist/telemetry/logging.js +40 -0
- package/dist/telemetry/logging.js.map +1 -0
- package/dist/telemetry/otel_http_exporter.cjs +166 -0
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
- package/dist/telemetry/otel_http_exporter.d.cts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
- package/dist/telemetry/otel_http_exporter.js +142 -0
- package/dist/telemetry/otel_http_exporter.js.map +1 -0
- package/dist/telemetry/pino_otel_transport.cjs +217 -0
- package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
- package/dist/telemetry/pino_otel_transport.d.cts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
- package/dist/telemetry/pino_otel_transport.js +189 -0
- package/dist/telemetry/pino_otel_transport.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +233 -0
- package/dist/telemetry/trace_types.cjs.map +1 -0
- package/dist/telemetry/trace_types.d.cts +74 -0
- package/dist/telemetry/trace_types.d.ts +74 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -0
- package/dist/telemetry/trace_types.js +141 -0
- package/dist/telemetry/trace_types.js.map +1 -0
- package/dist/telemetry/traces.cjs +484 -0
- package/dist/telemetry/traces.cjs.map +1 -0
- package/dist/telemetry/traces.d.cts +116 -0
- package/dist/telemetry/traces.d.ts +116 -0
- package/dist/telemetry/traces.d.ts.map +1 -0
- package/dist/telemetry/traces.js +449 -0
- package/dist/telemetry/traces.js.map +1 -0
- package/dist/telemetry/utils.cjs +86 -0
- package/dist/telemetry/utils.cjs.map +1 -0
- package/dist/telemetry/utils.d.cts +5 -0
- package/dist/telemetry/utils.d.ts +5 -0
- package/dist/telemetry/utils.d.ts.map +1 -0
- package/dist/telemetry/utils.js +51 -0
- package/dist/telemetry/utils.js.map +1 -0
- package/dist/tokenize/basic/sentence.cjs +3 -3
- package/dist/tokenize/basic/sentence.cjs.map +1 -1
- package/dist/tokenize/basic/sentence.js +3 -3
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +3 -1
- package/dist/tokenize/tokenizer.test.cjs.map +1 -1
- package/dist/tokenize/tokenizer.test.js +3 -1
- package/dist/tokenize/tokenizer.test.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +472 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +448 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +25 -8
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +25 -8
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +189 -57
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +58 -6
- package/dist/tts/tts.d.ts +58 -6
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +191 -59
- package/dist/tts/tts.js.map +1 -1
- package/dist/typed_promise.cjs +48 -0
- package/dist/typed_promise.cjs.map +1 -0
- package/dist/typed_promise.d.cts +24 -0
- package/dist/typed_promise.d.ts +24 -0
- package/dist/typed_promise.d.ts.map +1 -0
- package/dist/typed_promise.js +28 -0
- package/dist/typed_promise.js.map +1 -0
- package/dist/types.cjs +24 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +45 -10
- package/dist/types.d.ts +45 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +20 -30
- package/dist/types.js.map +1 -1
- package/dist/utils.cjs +124 -28
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +41 -1
- package/dist/utils.d.ts +41 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +119 -27
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +73 -1
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +74 -10
- package/dist/utils.test.js.map +1 -1
- package/dist/vad.cjs +35 -15
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +15 -5
- package/dist/vad.d.ts +15 -5
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +35 -15
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +258 -35
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +54 -13
- package/dist/voice/agent.d.ts +54 -13
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +254 -34
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +314 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +316 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1116 -385
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +72 -11
- package/dist/voice/agent_activity.d.ts +72 -11
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +1119 -383
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_activity.test.cjs +135 -0
- package/dist/voice/agent_activity.test.cjs.map +1 -0
- package/dist/voice/agent_activity.test.js +134 -0
- package/dist/voice/agent_activity.test.js.map +1 -0
- package/dist/voice/agent_session.cjs +550 -90
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +185 -25
- package/dist/voice/agent_session.d.ts +185 -25
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +556 -91
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +605 -46
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +96 -4
- package/dist/voice/audio_recognition.d.ts +96 -4
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +611 -47
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +295 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +299 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/avatar/datastream_io.cjs +7 -1
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +7 -1
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs +367 -0
- package/dist/voice/background_audio.cjs.map +1 -0
- package/dist/voice/background_audio.d.cts +123 -0
- package/dist/voice/background_audio.d.ts +123 -0
- package/dist/voice/background_audio.d.ts.map +1 -0
- package/dist/voice/background_audio.js +343 -0
- package/dist/voice/background_audio.js.map +1 -0
- package/dist/voice/events.cjs +3 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +16 -9
- package/dist/voice/events.d.ts +16 -9
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +3 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +205 -41
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +21 -5
- package/dist/voice/generation.d.ts +21 -5
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +215 -43
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/generation_tools.test.cjs +236 -0
- package/dist/voice/generation_tools.test.cjs.map +1 -0
- package/dist/voice/generation_tools.test.js +235 -0
- package/dist/voice/generation_tools.test.js.map +1 -0
- package/dist/voice/index.cjs +33 -2
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +8 -2
- package/dist/voice/index.d.ts +8 -2
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +19 -2
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/io.cjs +66 -6
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +67 -7
- package/dist/voice/io.d.ts +67 -7
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +62 -5
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +607 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +573 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/remote_session.cjs +922 -0
- package/dist/voice/remote_session.cjs.map +1 -0
- package/dist/voice/remote_session.d.cts +108 -0
- package/dist/voice/remote_session.d.ts +108 -0
- package/dist/voice/remote_session.d.ts.map +1 -0
- package/dist/voice/remote_session.js +887 -0
- package/dist/voice/remote_session.js.map +1 -0
- package/dist/voice/report.cjs +88 -0
- package/dist/voice/report.cjs.map +1 -0
- package/dist/voice/report.d.cts +49 -0
- package/dist/voice/report.d.ts +49 -0
- package/dist/voice/report.d.ts.map +1 -0
- package/dist/voice/report.js +63 -0
- package/dist/voice/report.js.map +1 -0
- package/dist/voice/report.test.cjs +121 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +120 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/_input.cjs +40 -7
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +5 -2
- package/dist/voice/room_io/_input.d.ts +5 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +41 -8
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +19 -11
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +7 -4
- package/dist/voice/room_io/_output.d.ts +7 -4
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +20 -12
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +33 -6
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +29 -9
- package/dist/voice/room_io/room_io.d.ts +29 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +33 -7
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +22 -4
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +17 -2
- package/dist/voice/speech_handle.d.ts +17 -2
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +21 -4
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +57 -0
- package/dist/voice/testing/index.cjs.map +1 -0
- package/dist/voice/testing/index.d.cts +21 -0
- package/dist/voice/testing/index.d.ts +21 -0
- package/dist/voice/testing/index.d.ts.map +1 -0
- package/dist/voice/testing/index.js +35 -0
- package/dist/voice/testing/index.js.map +1 -0
- package/dist/voice/testing/run_result.cjs +817 -0
- package/dist/voice/testing/run_result.cjs.map +1 -0
- package/dist/voice/testing/run_result.d.cts +385 -0
- package/dist/voice/testing/run_result.d.ts +385 -0
- package/dist/voice/testing/run_result.d.ts.map +1 -0
- package/dist/voice/testing/run_result.js +790 -0
- package/dist/voice/testing/run_result.js.map +1 -0
- package/dist/voice/testing/types.cjs +46 -0
- package/dist/voice/testing/types.cjs.map +1 -0
- package/dist/voice/testing/types.d.cts +83 -0
- package/dist/voice/testing/types.d.ts +83 -0
- package/dist/voice/testing/types.d.ts.map +1 -0
- package/dist/voice/testing/types.js +19 -0
- package/dist/voice/testing/types.js.map +1 -0
- package/dist/voice/transcription/synchronizer.cjs +139 -15
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +143 -16
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +157 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +37 -0
- package/dist/voice/turn_config/utils.d.ts +37 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +131 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +128 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +127 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/dist/worker.cjs +44 -52
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +18 -8
- package/dist/worker.d.ts +18 -8
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +43 -43
- package/dist/worker.js.map +1 -1
- package/package.json +35 -13
- package/resources/NOTICE +2 -0
- package/resources/keyboard-typing.ogg +0 -0
- package/resources/keyboard-typing2.ogg +0 -0
- package/resources/office-ambience.ogg +0 -0
- package/src/_exceptions.ts +5 -0
- package/src/audio.ts +132 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +203 -0
- package/src/cli.ts +57 -66
- package/src/connection_pool.test.ts +346 -0
- package/src/connection_pool.ts +307 -0
- package/src/constants.ts +14 -0
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/http_server.ts +18 -6
- package/src/index.ts +15 -13
- package/src/inference/api_protos.ts +85 -2
- package/src/inference/index.ts +32 -4
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +207 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +204 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +416 -0
- package/src/inference/llm.ts +214 -163
- package/src/inference/stt.test.ts +253 -0
- package/src/inference/stt.ts +449 -208
- package/src/inference/tts.test.ts +354 -0
- package/src/inference/tts.ts +417 -115
- package/src/inference/utils.ts +30 -2
- package/src/ipc/inference_proc_executor.ts +11 -3
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_executor.ts +11 -1
- package/src/ipc/job_proc_lazy_main.ts +86 -20
- package/src/ipc/supervised_proc.test.ts +153 -0
- package/src/ipc/supervised_proc.ts +39 -10
- package/src/job.ts +120 -1
- package/src/language.test.ts +62 -0
- package/src/language.ts +380 -0
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
- package/src/llm/chat_context.test.ts +787 -0
- package/src/llm/chat_context.ts +493 -2
- package/src/llm/fallback_adapter.test.ts +238 -0
- package/src/llm/fallback_adapter.ts +394 -0
- package/src/llm/index.ts +13 -0
- package/src/llm/llm.ts +77 -12
- package/src/llm/provider_format/google.test.ts +72 -1
- package/src/llm/provider_format/google.ts +10 -6
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +480 -2
- package/src/llm/provider_format/openai.ts +152 -21
- package/src/llm/provider_format/utils.ts +11 -5
- package/src/llm/realtime.ts +23 -2
- package/src/llm/remote_chat_context.ts +2 -2
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +115 -17
- package/src/llm/utils.test.ts +103 -2
- package/src/llm/utils.ts +152 -16
- package/src/llm/zod-utils.test.ts +577 -0
- package/src/llm/zod-utils.ts +153 -0
- package/src/log.ts +71 -19
- package/src/metrics/base.ts +78 -19
- package/src/metrics/index.ts +12 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +14 -3
- package/src/metrics/utils.ts +27 -7
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +43 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +545 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stream/stream_channel.test.ts +37 -0
- package/src/stream/stream_channel.ts +43 -3
- package/src/stt/stream_adapter.ts +30 -9
- package/src/stt/stt.ts +140 -23
- package/src/telemetry/index.ts +28 -0
- package/src/telemetry/logging.ts +55 -0
- package/src/telemetry/otel_http_exporter.ts +218 -0
- package/src/telemetry/pino_otel_transport.ts +265 -0
- package/src/telemetry/trace_types.ts +109 -0
- package/src/telemetry/traces.ts +673 -0
- package/src/telemetry/utils.ts +61 -0
- package/src/tokenize/basic/sentence.ts +3 -3
- package/src/tokenize/tokenizer.test.ts +4 -0
- package/src/transcription.ts +6 -0
- package/src/tts/fallback_adapter.ts +586 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/stream_adapter.ts +38 -8
- package/src/tts/tts.ts +245 -62
- package/src/typed_promise.ts +67 -0
- package/src/types.ts +62 -33
- package/src/utils.test.ts +90 -10
- package/src/utils.ts +178 -33
- package/src/vad.ts +42 -18
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +347 -2
- package/src/voice/agent.ts +346 -44
- package/src/voice/agent_activity.test.ts +194 -0
- package/src/voice/agent_activity.ts +1457 -388
- package/src/voice/agent_session.ts +817 -112
- package/src/voice/audio_recognition.ts +845 -70
- package/src/voice/audio_recognition_span.test.ts +341 -0
- package/src/voice/avatar/datastream_io.ts +9 -1
- package/src/voice/background_audio.ts +494 -0
- package/src/voice/events.ts +27 -7
- package/src/voice/generation.ts +310 -56
- package/src/voice/generation_tools.test.ts +268 -0
- package/src/voice/index.ts +17 -3
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/io.ts +115 -12
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +783 -0
- package/src/voice/remote_session.ts +1083 -0
- package/src/voice/report.test.ts +136 -0
- package/src/voice/report.ts +140 -0
- package/src/voice/room_io/_input.ts +45 -10
- package/src/voice/room_io/_output.ts +26 -14
- package/src/voice/room_io/room_io.ts +67 -22
- package/src/voice/speech_handle.ts +38 -6
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +52 -0
- package/src/voice/testing/run_result.ts +995 -0
- package/src/voice/testing/types.ts +118 -0
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +204 -19
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +148 -0
- package/src/voice/turn_config/utils.ts +167 -0
- package/src/voice/utils.ts +29 -0
- package/src/worker.ts +92 -78
- package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
|
@@ -3,13 +3,19 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { Mutex } from '@livekit/mutex';
|
|
5
5
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
6
|
+
import type { Span } from '@opentelemetry/api';
|
|
7
|
+
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
6
8
|
import { Heap } from 'heap-js';
|
|
7
9
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
8
|
-
import { ReadableStream } from 'node:stream/web';
|
|
9
|
-
import
|
|
10
|
+
import { ReadableStream, TransformStream } from 'node:stream/web';
|
|
11
|
+
import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
|
|
12
|
+
import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
13
|
+
import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
|
|
14
|
+
import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
|
|
10
15
|
import {
|
|
11
16
|
type ChatItem,
|
|
12
17
|
type FunctionCall,
|
|
18
|
+
type FunctionCallOutput,
|
|
13
19
|
type GenerationCreatedEvent,
|
|
14
20
|
type InputSpeechStartedEvent,
|
|
15
21
|
type InputSpeechStoppedEvent,
|
|
@@ -20,31 +26,41 @@ import {
|
|
|
20
26
|
type RealtimeSession,
|
|
21
27
|
type ToolChoice,
|
|
22
28
|
type ToolContext,
|
|
29
|
+
ToolFlag,
|
|
23
30
|
} from '../llm/index.js';
|
|
24
31
|
import type { LLMError } from '../llm/llm.js';
|
|
32
|
+
import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
25
33
|
import { log } from '../log.js';
|
|
26
34
|
import type {
|
|
27
35
|
EOUMetrics,
|
|
36
|
+
InterruptionMetrics,
|
|
28
37
|
LLMMetrics,
|
|
29
38
|
RealtimeModelMetrics,
|
|
30
39
|
STTMetrics,
|
|
31
40
|
TTSMetrics,
|
|
32
41
|
VADMetrics,
|
|
33
42
|
} from '../metrics/base.js';
|
|
34
|
-
import {
|
|
43
|
+
import { MultiInputStream } from '../stream/multi_input_stream.js';
|
|
35
44
|
import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
|
|
45
|
+
import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
|
|
36
46
|
import { splitWords } from '../tokenize/basic/word.js';
|
|
37
47
|
import { TTS, type TTSError } from '../tts/tts.js';
|
|
38
|
-
import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
|
|
48
|
+
import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js';
|
|
39
49
|
import { VAD, type VADEvent } from '../vad.js';
|
|
40
50
|
import type { Agent, ModelSettings } from './agent.js';
|
|
41
|
-
import {
|
|
51
|
+
import {
|
|
52
|
+
StopResponse,
|
|
53
|
+
_getActivityTaskInfo,
|
|
54
|
+
_setActivityTaskInfo,
|
|
55
|
+
functionCallStorage,
|
|
56
|
+
speechHandleStorage,
|
|
57
|
+
} from './agent.js';
|
|
42
58
|
import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
|
|
43
59
|
import {
|
|
44
60
|
AudioRecognition,
|
|
45
61
|
type EndOfTurnInfo,
|
|
62
|
+
type PreemptiveGenerationInfo,
|
|
46
63
|
type RecognitionHooks,
|
|
47
|
-
type _TurnDetector,
|
|
48
64
|
} from './audio_recognition.js';
|
|
49
65
|
import {
|
|
50
66
|
AgentSessionEventTypes,
|
|
@@ -54,7 +70,7 @@ import {
|
|
|
54
70
|
createSpeechCreatedEvent,
|
|
55
71
|
createUserInputTranscribedEvent,
|
|
56
72
|
} from './events.js';
|
|
57
|
-
import type { ToolExecutionOutput } from './generation.js';
|
|
73
|
+
import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
|
|
58
74
|
import {
|
|
59
75
|
type _AudioOut,
|
|
60
76
|
type _TextOut,
|
|
@@ -66,34 +82,105 @@ import {
|
|
|
66
82
|
removeInstructions,
|
|
67
83
|
updateInstructions,
|
|
68
84
|
} from './generation.js';
|
|
85
|
+
import type { TimedString } from './io.js';
|
|
69
86
|
import { SpeechHandle } from './speech_handle.js';
|
|
87
|
+
import { setParticipantSpanAttributes } from './utils.js';
|
|
88
|
+
|
|
89
|
+
export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
|
|
90
|
+
export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
|
|
91
|
+
|
|
92
|
+
interface OnEnterData {
|
|
93
|
+
session: AgentSession;
|
|
94
|
+
agent: Agent;
|
|
95
|
+
}
|
|
70
96
|
|
|
71
|
-
|
|
72
|
-
|
|
97
|
+
interface PreemptiveGeneration {
|
|
98
|
+
speechHandle: SpeechHandle;
|
|
99
|
+
userMessage: ChatMessage;
|
|
100
|
+
info: PreemptiveGenerationInfo;
|
|
101
|
+
chatCtx: ChatContext;
|
|
102
|
+
tools: ToolContext;
|
|
103
|
+
toolChoice: ToolChoice | null;
|
|
104
|
+
createdAt: number;
|
|
105
|
+
}
|
|
73
106
|
|
|
107
|
+
// TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
|
|
74
108
|
export class AgentActivity implements RecognitionHooks {
|
|
109
|
+
agent: Agent;
|
|
110
|
+
agentSession: AgentSession;
|
|
111
|
+
|
|
75
112
|
private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
|
|
113
|
+
|
|
76
114
|
private started = false;
|
|
77
115
|
private audioRecognition?: AudioRecognition;
|
|
78
116
|
private realtimeSession?: RealtimeSession;
|
|
79
|
-
private
|
|
117
|
+
private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
|
|
118
|
+
private turnDetectionMode?: TurnDetectionMode;
|
|
80
119
|
private logger = log();
|
|
81
|
-
private
|
|
120
|
+
private _schedulingPaused = true;
|
|
121
|
+
private _drainBlockedTasks: Task<any>[] = [];
|
|
82
122
|
private _currentSpeech?: SpeechHandle;
|
|
83
123
|
private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
|
|
84
124
|
private q_updated: Future;
|
|
85
125
|
private speechTasks: Set<Task<void>> = new Set();
|
|
86
126
|
private lock = new Mutex();
|
|
87
|
-
private audioStream = new
|
|
127
|
+
private audioStream = new MultiInputStream<AudioFrame>();
|
|
128
|
+
private audioStreamId?: string;
|
|
129
|
+
|
|
88
130
|
// default to null as None, which maps to the default provider tool choice value
|
|
89
131
|
private toolChoice: ToolChoice | null = null;
|
|
132
|
+
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
133
|
+
private interruptionDetector?: AdaptiveInterruptionDetector;
|
|
134
|
+
private isInterruptionDetectionEnabled: boolean;
|
|
135
|
+
private isInterruptionByAudioActivityEnabled: boolean;
|
|
136
|
+
private isDefaultInterruptionByAudioActivityEnabled: boolean;
|
|
90
137
|
|
|
91
|
-
|
|
92
|
-
|
|
138
|
+
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
|
|
139
|
+
this.onGenerationCreated(ev);
|
|
140
|
+
|
|
141
|
+
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
|
|
142
|
+
this.onInputSpeechStarted(ev);
|
|
143
|
+
|
|
144
|
+
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
|
|
145
|
+
this.onInputSpeechStopped(ev);
|
|
146
|
+
|
|
147
|
+
private readonly onRealtimeInputAudioTranscriptionCompleted = (
|
|
148
|
+
ev: InputTranscriptionCompleted,
|
|
149
|
+
): void => this.onInputAudioTranscriptionCompleted(ev);
|
|
150
|
+
|
|
151
|
+
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
|
|
152
|
+
this.onError(ev);
|
|
153
|
+
|
|
154
|
+
private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
|
|
155
|
+
this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
|
|
159
|
+
this.agentSession._usageCollector.collect(ev);
|
|
160
|
+
this.agentSession.emit(
|
|
161
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
162
|
+
createMetricsCollectedEvent({ metrics: ev }),
|
|
163
|
+
);
|
|
164
|
+
};
|
|
165
|
+
|
|
166
|
+
private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
|
|
167
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
168
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
169
|
+
|
|
170
|
+
if (!ev.recoverable) {
|
|
171
|
+
this.agentSession._onError(ev);
|
|
172
|
+
this.fallbackToVadInterruption();
|
|
173
|
+
return;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
this.agentSession._onError(ev);
|
|
177
|
+
};
|
|
93
178
|
|
|
94
179
|
/** @internal */
|
|
95
180
|
_mainTask?: Task<void>;
|
|
96
|
-
|
|
181
|
+
_onEnterTask?: Task<void>;
|
|
182
|
+
_onExitTask?: Task<void>;
|
|
183
|
+
_userTurnCompletedTask?: Task<void>;
|
|
97
184
|
|
|
98
185
|
constructor(agent: Agent, agentSession: AgentSession) {
|
|
99
186
|
this.agent = agent;
|
|
@@ -114,7 +201,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
114
201
|
|
|
115
202
|
if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
|
|
116
203
|
this.logger.warn(
|
|
117
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
204
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
|
|
118
205
|
);
|
|
119
206
|
this.turnDetectionMode = undefined;
|
|
120
207
|
}
|
|
@@ -177,104 +264,172 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
177
264
|
if (
|
|
178
265
|
!this.vad &&
|
|
179
266
|
this.stt &&
|
|
267
|
+
!this.stt.capabilities.streaming &&
|
|
180
268
|
this.llm instanceof LLM &&
|
|
181
269
|
this.allowInterruptions &&
|
|
182
270
|
this.turnDetectionMode === undefined
|
|
183
271
|
) {
|
|
184
272
|
this.logger.warn(
|
|
185
|
-
'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
|
|
273
|
+
'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
|
|
186
274
|
'for more responsive interruption handling.',
|
|
187
275
|
);
|
|
188
276
|
}
|
|
277
|
+
|
|
278
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
279
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
280
|
+
|
|
281
|
+
// this allows taking over audio interruption temporarily until interruption is detected
|
|
282
|
+
// by default it is true unless turnDetection is manual or realtime_llm
|
|
283
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
284
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
285
|
+
|
|
286
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
189
287
|
}
|
|
190
288
|
|
|
191
289
|
async start(): Promise<void> {
|
|
192
290
|
const unlock = await this.lock.lock();
|
|
193
291
|
try {
|
|
194
|
-
this.
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
|
|
200
|
-
this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
|
|
201
|
-
this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
|
|
202
|
-
this.onInputAudioTranscriptionCompleted(ev),
|
|
203
|
-
);
|
|
204
|
-
this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
|
|
205
|
-
this.realtimeSession.on('error', (ev) => this.onError(ev));
|
|
206
|
-
|
|
207
|
-
removeInstructions(this.agent._chatCtx);
|
|
208
|
-
try {
|
|
209
|
-
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
210
|
-
} catch (error) {
|
|
211
|
-
this.logger.error(error, 'failed to update the instructions');
|
|
212
|
-
}
|
|
292
|
+
await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
|
|
293
|
+
} finally {
|
|
294
|
+
unlock();
|
|
295
|
+
}
|
|
296
|
+
}
|
|
213
297
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
298
|
+
async resume(): Promise<void> {
|
|
299
|
+
const unlock = await this.lock.lock();
|
|
300
|
+
try {
|
|
301
|
+
await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
|
|
302
|
+
} finally {
|
|
303
|
+
unlock();
|
|
304
|
+
}
|
|
305
|
+
}
|
|
219
306
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
307
|
+
private async _startSession(options: {
|
|
308
|
+
spanName: 'start_agent_activity' | 'resume_agent_activity';
|
|
309
|
+
runOnEnter: boolean;
|
|
310
|
+
}): Promise<void> {
|
|
311
|
+
const { spanName, runOnEnter } = options;
|
|
312
|
+
const startSpan = tracer.startSpan({
|
|
313
|
+
name: spanName,
|
|
314
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
315
|
+
context: ROOT_CONTEXT,
|
|
316
|
+
});
|
|
317
|
+
|
|
318
|
+
this.agent._agentActivity = this;
|
|
319
|
+
|
|
320
|
+
if (this.llm instanceof RealtimeModel) {
|
|
321
|
+
this.realtimeSession = this.llm.session();
|
|
322
|
+
this.realtimeSpans = new Map<string, Span>();
|
|
323
|
+
this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
|
|
324
|
+
this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
|
|
325
|
+
this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
|
|
326
|
+
this.realtimeSession.on(
|
|
327
|
+
'input_audio_transcription_completed',
|
|
328
|
+
this.onRealtimeInputAudioTranscriptionCompleted,
|
|
329
|
+
);
|
|
330
|
+
this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
|
|
331
|
+
this.realtimeSession.on('error', this.onModelError);
|
|
236
332
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
this.
|
|
240
|
-
|
|
333
|
+
removeInstructions(this.agent._chatCtx);
|
|
334
|
+
try {
|
|
335
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
336
|
+
} catch (error) {
|
|
337
|
+
this.logger.error(error, 'failed to update the instructions');
|
|
241
338
|
}
|
|
242
339
|
|
|
243
|
-
|
|
244
|
-
this.
|
|
245
|
-
|
|
340
|
+
try {
|
|
341
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
342
|
+
} catch (error) {
|
|
343
|
+
this.logger.error(error, 'failed to update the chat context');
|
|
246
344
|
}
|
|
247
345
|
|
|
248
|
-
|
|
249
|
-
this.
|
|
250
|
-
|
|
346
|
+
try {
|
|
347
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
348
|
+
} catch (error) {
|
|
349
|
+
this.logger.error(error, 'failed to update the tools');
|
|
251
350
|
}
|
|
252
351
|
|
|
253
|
-
if (this.
|
|
254
|
-
this.
|
|
352
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
353
|
+
this.logger.error(
|
|
354
|
+
'audio output is enabled but RealtimeModel has no audio modality ' +
|
|
355
|
+
'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
|
|
356
|
+
'or set a TTS model.',
|
|
357
|
+
);
|
|
358
|
+
}
|
|
359
|
+
} else if (this.llm instanceof LLM) {
|
|
360
|
+
try {
|
|
361
|
+
updateInstructions({
|
|
362
|
+
chatCtx: this.agent._chatCtx,
|
|
363
|
+
instructions: this.agent.instructions,
|
|
364
|
+
addIfMissing: true,
|
|
365
|
+
});
|
|
366
|
+
} catch (error) {
|
|
367
|
+
this.logger.error('failed to update the instructions', error);
|
|
255
368
|
}
|
|
369
|
+
}
|
|
256
370
|
|
|
257
|
-
|
|
258
|
-
recognitionHooks: this,
|
|
259
|
-
// Disable stt node if stt is not provided
|
|
260
|
-
stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
|
|
261
|
-
vad: this.vad,
|
|
262
|
-
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
263
|
-
turnDetectionMode: this.turnDetectionMode,
|
|
264
|
-
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
265
|
-
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
266
|
-
});
|
|
267
|
-
this.audioRecognition.start();
|
|
268
|
-
this.started = true;
|
|
371
|
+
// TODO(parity): Record initial AgentConfigUpdate in chat context
|
|
269
372
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
373
|
+
// metrics and error handling
|
|
374
|
+
if (this.llm instanceof LLM) {
|
|
375
|
+
this.llm.on('metrics_collected', this.onMetricsCollected);
|
|
376
|
+
this.llm.on('error', this.onModelError);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
if (this.stt instanceof STT) {
|
|
380
|
+
this.stt.on('metrics_collected', this.onMetricsCollected);
|
|
381
|
+
this.stt.on('error', this.onModelError);
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
if (this.tts instanceof TTS) {
|
|
385
|
+
this.tts.on('metrics_collected', this.onMetricsCollected);
|
|
386
|
+
this.tts.on('error', this.onModelError);
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
if (this.vad instanceof VAD) {
|
|
390
|
+
this.vad.on('metrics_collected', this.onMetricsCollected);
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
this.audioRecognition = new AudioRecognition({
|
|
394
|
+
recognitionHooks: this,
|
|
395
|
+
// Disable stt node if stt is not provided
|
|
396
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
|
|
397
|
+
vad: this.vad,
|
|
398
|
+
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
399
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
400
|
+
interruptionDetection: this.interruptionDetector,
|
|
401
|
+
minEndpointingDelay:
|
|
402
|
+
this.agent.turnHandling?.endpointing?.minDelay ??
|
|
403
|
+
this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
|
|
404
|
+
maxEndpointingDelay:
|
|
405
|
+
this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
406
|
+
this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
|
|
407
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
408
|
+
sttModel: this.stt?.label,
|
|
409
|
+
sttProvider: this.getSttProvider(),
|
|
410
|
+
getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
|
|
411
|
+
});
|
|
412
|
+
this.audioRecognition.start();
|
|
413
|
+
this.started = true;
|
|
414
|
+
|
|
415
|
+
this._resumeSchedulingTask();
|
|
416
|
+
|
|
417
|
+
if (runOnEnter) {
|
|
418
|
+
this._onEnterTask = this.createSpeechTask({
|
|
419
|
+
taskFn: () =>
|
|
420
|
+
onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
|
|
421
|
+
tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
422
|
+
name: 'on_enter',
|
|
423
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
424
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
425
|
+
}),
|
|
426
|
+
),
|
|
427
|
+
inlineTask: true,
|
|
273
428
|
name: 'AgentActivity_onEnter',
|
|
274
429
|
});
|
|
275
|
-
} finally {
|
|
276
|
-
unlock();
|
|
277
430
|
}
|
|
431
|
+
|
|
432
|
+
startSpan.end();
|
|
278
433
|
}
|
|
279
434
|
|
|
280
435
|
get currentSpeech(): SpeechHandle | undefined {
|
|
@@ -289,6 +444,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
289
444
|
return this.agent.stt || this.agentSession.stt;
|
|
290
445
|
}
|
|
291
446
|
|
|
447
|
+
private getSttProvider(): string | undefined {
|
|
448
|
+
const label = this.stt?.label;
|
|
449
|
+
if (!label) {
|
|
450
|
+
return undefined;
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// Heuristic: most labels look like "<provider>-<model>"
|
|
454
|
+
const [provider] = label.split('-', 1);
|
|
455
|
+
return provider || label;
|
|
456
|
+
}
|
|
457
|
+
|
|
292
458
|
get llm(): LLM | RealtimeModel | undefined {
|
|
293
459
|
return this.agent.llm || this.agentSession.llm;
|
|
294
460
|
}
|
|
@@ -301,8 +467,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
301
467
|
return this.agent.toolCtx;
|
|
302
468
|
}
|
|
303
469
|
|
|
304
|
-
get
|
|
305
|
-
return this.
|
|
470
|
+
get schedulingPaused(): boolean {
|
|
471
|
+
return this._schedulingPaused;
|
|
306
472
|
}
|
|
307
473
|
|
|
308
474
|
get realtimeLLMSession(): RealtimeSession | undefined {
|
|
@@ -310,19 +476,48 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
310
476
|
}
|
|
311
477
|
|
|
312
478
|
get allowInterruptions(): boolean {
|
|
313
|
-
|
|
314
|
-
|
|
479
|
+
return (
|
|
480
|
+
this.agent.turnHandling?.interruption?.enabled ??
|
|
481
|
+
this.agentSession.sessionOptions.turnHandling.interruption.enabled
|
|
482
|
+
);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
get useTtsAlignedTranscript(): boolean {
|
|
486
|
+
// Agent setting takes precedence over session setting
|
|
487
|
+
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
315
488
|
}
|
|
316
489
|
|
|
317
490
|
get turnDetection(): TurnDetectionMode | undefined {
|
|
318
|
-
|
|
319
|
-
|
|
491
|
+
return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection;
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
get turnHandling() {
|
|
495
|
+
return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
|
|
320
496
|
}
|
|
321
497
|
|
|
498
|
+
// get minEndpointingDelay(): number {
|
|
499
|
+
// return (
|
|
500
|
+
// this.agent.turnHandling?.endpointing?.minDelay ??
|
|
501
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
|
|
502
|
+
// );
|
|
503
|
+
// }
|
|
504
|
+
|
|
505
|
+
// get maxEndpointingDelay(): number {
|
|
506
|
+
// return (
|
|
507
|
+
// this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
508
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
|
|
509
|
+
// );
|
|
510
|
+
// }
|
|
511
|
+
|
|
322
512
|
get toolCtx(): ToolContext {
|
|
323
513
|
return this.agent.toolCtx;
|
|
324
514
|
}
|
|
325
515
|
|
|
516
|
+
/** @internal */
|
|
517
|
+
get inputStartedAt() {
|
|
518
|
+
return this.audioRecognition?.inputStartedAt;
|
|
519
|
+
}
|
|
520
|
+
|
|
326
521
|
async updateChatCtx(chatCtx: ChatContext): Promise<void> {
|
|
327
522
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
328
523
|
|
|
@@ -340,7 +535,27 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
340
535
|
}
|
|
341
536
|
}
|
|
342
537
|
|
|
343
|
-
|
|
538
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
539
|
+
async updateTools(tools: ToolContext): Promise<void> {
|
|
540
|
+
this.agent._tools = { ...tools };
|
|
541
|
+
|
|
542
|
+
if (this.realtimeSession) {
|
|
543
|
+
await this.realtimeSession.updateTools(tools);
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
if (this.llm instanceof LLM) {
|
|
547
|
+
// for realtime LLM, we assume the server will remove unvalid tool messages
|
|
548
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
updateOptions({
|
|
553
|
+
toolChoice,
|
|
554
|
+
turnDetection,
|
|
555
|
+
}: {
|
|
556
|
+
toolChoice?: ToolChoice | null;
|
|
557
|
+
turnDetection?: TurnDetectionMode;
|
|
558
|
+
}): void {
|
|
344
559
|
if (toolChoice !== undefined) {
|
|
345
560
|
this.toolChoice = toolChoice;
|
|
346
561
|
}
|
|
@@ -348,43 +563,85 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
348
563
|
if (this.realtimeSession) {
|
|
349
564
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
350
565
|
}
|
|
566
|
+
|
|
567
|
+
if (turnDetection !== undefined) {
|
|
568
|
+
this.turnDetectionMode = turnDetection;
|
|
569
|
+
this.isDefaultInterruptionByAudioActivityEnabled =
|
|
570
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
571
|
+
|
|
572
|
+
// sync live flag immediately when not speaking so the change takes effect right away
|
|
573
|
+
if (this.agentSession.agentState !== 'speaking') {
|
|
574
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
575
|
+
this.isDefaultInterruptionByAudioActivityEnabled;
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
if (this.audioRecognition) {
|
|
580
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
581
|
+
}
|
|
351
582
|
}
|
|
352
583
|
|
|
353
584
|
attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
585
|
+
void this.audioStream.close();
|
|
586
|
+
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
587
|
+
|
|
588
|
+
// Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
|
|
589
|
+
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
|
|
590
|
+
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
|
|
591
|
+
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
|
|
592
|
+
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
|
|
593
|
+
transform: (frame, controller) => {
|
|
594
|
+
const shouldDiscardForAecWarmup =
|
|
595
|
+
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
|
|
596
|
+
if (!shouldDiscardForAecWarmup) {
|
|
597
|
+
controller.enqueue(frame);
|
|
598
|
+
}
|
|
599
|
+
},
|
|
600
|
+
});
|
|
358
601
|
|
|
359
|
-
|
|
360
|
-
* We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
|
|
361
|
-
* The tee() operation should be applied to the deferred stream, not the original audioStream.
|
|
362
|
-
* This is important because teeing the original stream directly makes it very difficult—if not
|
|
363
|
-
* impossible—to implement stream unlock logic cleanly.
|
|
364
|
-
*/
|
|
365
|
-
this.audioStream.setSource(audioStream);
|
|
366
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
602
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
367
603
|
|
|
368
|
-
if (this.realtimeSession) {
|
|
604
|
+
if (this.realtimeSession && this.audioRecognition) {
|
|
605
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
606
|
+
.pipeThrough(aecWarmupAudioFilter)
|
|
607
|
+
.tee();
|
|
369
608
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
if (this.audioRecognition) {
|
|
373
609
|
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
|
|
610
|
+
} else if (this.realtimeSession) {
|
|
611
|
+
this.realtimeSession.setInputAudioStream(
|
|
612
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
613
|
+
);
|
|
614
|
+
} else if (this.audioRecognition) {
|
|
615
|
+
this.audioRecognition.setInputAudioStream(
|
|
616
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
617
|
+
);
|
|
374
618
|
}
|
|
375
619
|
}
|
|
376
620
|
|
|
377
621
|
detachAudioInput(): void {
|
|
378
|
-
this.
|
|
622
|
+
if (this.audioStreamId === undefined) {
|
|
623
|
+
return;
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
void this.audioStream.close();
|
|
627
|
+
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
628
|
+
this.audioStreamId = undefined;
|
|
379
629
|
}
|
|
380
630
|
|
|
381
|
-
commitUserTurn(
|
|
631
|
+
commitUserTurn(
|
|
632
|
+
options: {
|
|
633
|
+
audioDetached?: boolean;
|
|
634
|
+
throwIfNotReady?: boolean;
|
|
635
|
+
} = {},
|
|
636
|
+
) {
|
|
637
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
382
638
|
if (!this.audioRecognition) {
|
|
383
|
-
|
|
639
|
+
if (throwIfNotReady) {
|
|
640
|
+
throw new Error('AudioRecognition is not initialized');
|
|
641
|
+
}
|
|
642
|
+
return;
|
|
384
643
|
}
|
|
385
644
|
|
|
386
|
-
// TODO(brian): add audio_detached flag
|
|
387
|
-
const audioDetached = false;
|
|
388
645
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
389
646
|
}
|
|
390
647
|
|
|
@@ -442,14 +699,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
442
699
|
}),
|
|
443
700
|
);
|
|
444
701
|
const task = this.createSpeechTask({
|
|
445
|
-
|
|
702
|
+
taskFn: (abortController: AbortController) =>
|
|
446
703
|
this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
447
|
-
),
|
|
448
704
|
ownedSpeechHandle: handle,
|
|
449
705
|
name: 'AgentActivity.say_tts',
|
|
450
706
|
});
|
|
451
707
|
|
|
452
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
708
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
453
709
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
454
710
|
return handle;
|
|
455
711
|
}
|
|
@@ -463,6 +719,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
463
719
|
if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
|
|
464
720
|
ev.speechId = speechHandle.id;
|
|
465
721
|
}
|
|
722
|
+
|
|
723
|
+
// Record realtime metrics on the associated span (if available)
|
|
724
|
+
if (ev.type === 'realtime_model_metrics' && this.realtimeSpans) {
|
|
725
|
+
const span = this.realtimeSpans.get(ev.requestId);
|
|
726
|
+
if (span) {
|
|
727
|
+
recordRealtimeMetrics(span, ev);
|
|
728
|
+
this.realtimeSpans.delete(ev.requestId);
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
this.agentSession._usageCollector.collect(ev);
|
|
733
|
+
|
|
466
734
|
this.agentSession.emit(
|
|
467
735
|
AgentSessionEventTypes.MetricsCollected,
|
|
468
736
|
createMetricsCollectedEvent({ metrics: ev }),
|
|
@@ -494,6 +762,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
494
762
|
|
|
495
763
|
if (!this.vad) {
|
|
496
764
|
this.agentSession._updateUserState('speaking');
|
|
765
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
766
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
767
|
+
0,
|
|
768
|
+
Date.now(),
|
|
769
|
+
this.agentSession._userSpeakingSpan,
|
|
770
|
+
);
|
|
771
|
+
}
|
|
497
772
|
}
|
|
498
773
|
|
|
499
774
|
// this.interrupt() is going to raise when allow_interruptions is False,
|
|
@@ -512,6 +787,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
512
787
|
this.logger.info(ev, 'onInputSpeechStopped');
|
|
513
788
|
|
|
514
789
|
if (!this.vad) {
|
|
790
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
791
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
792
|
+
}
|
|
515
793
|
this.agentSession._updateUserState('listening');
|
|
516
794
|
}
|
|
517
795
|
|
|
@@ -552,10 +830,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
552
830
|
return;
|
|
553
831
|
}
|
|
554
832
|
|
|
555
|
-
if (this.
|
|
556
|
-
// copied from python:
|
|
833
|
+
if (this.schedulingPaused) {
|
|
557
834
|
// TODO(shubhra): should we "forward" this new turn to the next agent?
|
|
558
|
-
this.logger.warn('skipping new realtime generation, the
|
|
835
|
+
this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
|
|
559
836
|
return;
|
|
560
837
|
}
|
|
561
838
|
|
|
@@ -573,9 +850,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
573
850
|
this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
|
|
574
851
|
|
|
575
852
|
this.createSpeechTask({
|
|
576
|
-
|
|
853
|
+
taskFn: (abortController: AbortController) =>
|
|
577
854
|
this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
578
|
-
),
|
|
579
855
|
ownedSpeechHandle: handle,
|
|
580
856
|
name: 'AgentActivity.realtimeGeneration',
|
|
581
857
|
});
|
|
@@ -584,13 +860,43 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
584
860
|
}
|
|
585
861
|
|
|
586
862
|
// recognition hooks
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
863
|
+
onStartOfSpeech(ev: VADEvent): void {
|
|
864
|
+
let speechStartTime = Date.now();
|
|
865
|
+
if (ev) {
|
|
866
|
+
// Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
|
|
867
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
868
|
+
}
|
|
869
|
+
this.agentSession._updateUserState('speaking', {
|
|
870
|
+
lastSpeakingTime: speechStartTime,
|
|
871
|
+
otelContext: otelContext.active(),
|
|
872
|
+
});
|
|
873
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
874
|
+
// Pass speechStartTime as the absolute startedAt timestamp.
|
|
875
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
876
|
+
ev.speechDuration,
|
|
877
|
+
speechStartTime,
|
|
878
|
+
this.agentSession._userSpeakingSpan,
|
|
879
|
+
);
|
|
880
|
+
}
|
|
590
881
|
}
|
|
591
882
|
|
|
592
|
-
onEndOfSpeech(
|
|
593
|
-
|
|
883
|
+
onEndOfSpeech(ev: VADEvent): void {
|
|
884
|
+
let speechEndTime = Date.now();
|
|
885
|
+
if (ev) {
|
|
886
|
+
// Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
|
|
887
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
888
|
+
}
|
|
889
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
890
|
+
// Pass speechEndTime as the absolute endedAt timestamp.
|
|
891
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
892
|
+
speechEndTime,
|
|
893
|
+
this.agentSession._userSpeakingSpan,
|
|
894
|
+
);
|
|
895
|
+
}
|
|
896
|
+
this.agentSession._updateUserState('listening', {
|
|
897
|
+
lastSpeakingTime: speechEndTime,
|
|
898
|
+
otelContext: otelContext.active(),
|
|
899
|
+
});
|
|
594
900
|
}
|
|
595
901
|
|
|
596
902
|
onVADInferenceDone(ev: VADEvent): void {
|
|
@@ -599,20 +905,47 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
599
905
|
return;
|
|
600
906
|
}
|
|
601
907
|
|
|
602
|
-
if (
|
|
603
|
-
|
|
908
|
+
if (
|
|
909
|
+
ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration
|
|
910
|
+
) {
|
|
911
|
+
this.interruptByAudioActivity();
|
|
912
|
+
}
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
private interruptByAudioActivity(): void {
|
|
916
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
604
917
|
return;
|
|
605
918
|
}
|
|
606
919
|
|
|
607
|
-
if (
|
|
920
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
921
|
+
// Disable interruption from audio activity while AEC warmup is active.
|
|
608
922
|
return;
|
|
609
923
|
}
|
|
610
924
|
|
|
611
|
-
if (this.
|
|
612
|
-
|
|
925
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
926
|
+
// skip speech handle interruption if server side turn detection is enabled
|
|
927
|
+
return;
|
|
928
|
+
}
|
|
613
929
|
|
|
930
|
+
// Refactored interruption word count check:
|
|
931
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
932
|
+
// - Apply check to all STT results: empty string, undefined, or any length
|
|
933
|
+
// - This ensures consistent behavior across all interruption scenarios
|
|
934
|
+
if (
|
|
935
|
+
this.stt &&
|
|
936
|
+
this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 &&
|
|
937
|
+
this.audioRecognition
|
|
938
|
+
) {
|
|
939
|
+
const text = this.audioRecognition.currentTranscript;
|
|
614
940
|
// TODO(shubhra): better word splitting for multi-language
|
|
615
|
-
|
|
941
|
+
|
|
942
|
+
// Normalize text: convert undefined/null to empty string for consistent word counting
|
|
943
|
+
const normalizedText = text ?? '';
|
|
944
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
945
|
+
|
|
946
|
+
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
947
|
+
// This applies to all cases: empty strings, partial speech, and full speech
|
|
948
|
+
if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
|
|
616
949
|
return;
|
|
617
950
|
}
|
|
618
951
|
}
|
|
@@ -624,12 +957,23 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
624
957
|
!this._currentSpeech.interrupted &&
|
|
625
958
|
this._currentSpeech.allowInterruptions
|
|
626
959
|
) {
|
|
627
|
-
this.logger.info(
|
|
960
|
+
this.logger.info(
|
|
961
|
+
{ 'speech id': this._currentSpeech.id },
|
|
962
|
+
'speech interrupted by audio activity',
|
|
963
|
+
);
|
|
628
964
|
this.realtimeSession?.interrupt();
|
|
629
965
|
this._currentSpeech.interrupt();
|
|
630
966
|
}
|
|
631
967
|
}
|
|
632
968
|
|
|
969
|
+
onInterruption(ev: OverlappingSpeechEvent) {
|
|
970
|
+
this.restoreInterruptionByAudioActivity();
|
|
971
|
+
this.interruptByAudioActivity();
|
|
972
|
+
if (this.audioRecognition) {
|
|
973
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
|
|
633
977
|
onInterimTranscript(ev: SpeechEvent): void {
|
|
634
978
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
635
979
|
// skip stt transcription if userTranscription is enabled on the realtime model
|
|
@@ -641,9 +985,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
641
985
|
createUserInputTranscribedEvent({
|
|
642
986
|
transcript: ev.alternatives![0].text,
|
|
643
987
|
isFinal: false,
|
|
988
|
+
language: ev.alternatives![0].language,
|
|
644
989
|
// TODO(AJS-106): add multi participant support
|
|
645
990
|
}),
|
|
646
991
|
);
|
|
992
|
+
|
|
993
|
+
if (ev.alternatives![0].text) {
|
|
994
|
+
this.interruptByAudioActivity();
|
|
995
|
+
}
|
|
647
996
|
}
|
|
648
997
|
|
|
649
998
|
onFinalTranscript(ev: SpeechEvent): void {
|
|
@@ -657,17 +1006,103 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
657
1006
|
createUserInputTranscribedEvent({
|
|
658
1007
|
transcript: ev.alternatives![0].text,
|
|
659
1008
|
isFinal: true,
|
|
1009
|
+
language: ev.alternatives![0].language,
|
|
660
1010
|
// TODO(AJS-106): add multi participant support
|
|
661
1011
|
}),
|
|
662
1012
|
);
|
|
1013
|
+
|
|
1014
|
+
// agent speech might not be interrupted if VAD failed and a final transcript is received
|
|
1015
|
+
// we call interruptByAudioActivity (idempotent) to pause the speech, if possible
|
|
1016
|
+
if (
|
|
1017
|
+
this.audioRecognition &&
|
|
1018
|
+
this.turnDetection !== 'manual' &&
|
|
1019
|
+
this.turnDetection !== 'realtime_llm'
|
|
1020
|
+
) {
|
|
1021
|
+
this.interruptByAudioActivity();
|
|
1022
|
+
|
|
1023
|
+
// TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
// TODO: resume false interruption - start interrupt paused speech task
|
|
1027
|
+
}
|
|
1028
|
+
|
|
1029
|
+
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
|
|
1030
|
+
if (
|
|
1031
|
+
!this.agentSession.sessionOptions.preemptiveGeneration ||
|
|
1032
|
+
this.schedulingPaused ||
|
|
1033
|
+
(this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
|
|
1034
|
+
!(this.llm instanceof LLM)
|
|
1035
|
+
) {
|
|
1036
|
+
return;
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
this.cancelPreemptiveGeneration();
|
|
1040
|
+
|
|
1041
|
+
this.logger.info(
|
|
1042
|
+
{
|
|
1043
|
+
newTranscript: info.newTranscript,
|
|
1044
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
1045
|
+
},
|
|
1046
|
+
'starting preemptive generation',
|
|
1047
|
+
);
|
|
1048
|
+
|
|
1049
|
+
const userMessage = ChatMessage.create({
|
|
1050
|
+
role: 'user',
|
|
1051
|
+
content: info.newTranscript,
|
|
1052
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
1053
|
+
});
|
|
1054
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
1055
|
+
const speechHandle = this.generateReply({
|
|
1056
|
+
userMessage,
|
|
1057
|
+
chatCtx,
|
|
1058
|
+
scheduleSpeech: false,
|
|
1059
|
+
});
|
|
1060
|
+
|
|
1061
|
+
this._preemptiveGeneration = {
|
|
1062
|
+
speechHandle,
|
|
1063
|
+
userMessage,
|
|
1064
|
+
info,
|
|
1065
|
+
chatCtx: chatCtx.copy(),
|
|
1066
|
+
tools: { ...this.tools },
|
|
1067
|
+
toolChoice: this.toolChoice,
|
|
1068
|
+
createdAt: Date.now(),
|
|
1069
|
+
};
|
|
1070
|
+
}
|
|
1071
|
+
|
|
1072
|
+
private cancelPreemptiveGeneration(): void {
|
|
1073
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
1074
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
1075
|
+
this._preemptiveGeneration = undefined;
|
|
1076
|
+
}
|
|
663
1077
|
}
|
|
664
1078
|
|
|
665
1079
|
private createSpeechTask(options: {
|
|
666
|
-
|
|
1080
|
+
taskFn: (controller: AbortController) => Promise<void>;
|
|
1081
|
+
controller?: AbortController;
|
|
667
1082
|
ownedSpeechHandle?: SpeechHandle;
|
|
1083
|
+
inlineTask?: boolean;
|
|
668
1084
|
name?: string;
|
|
669
|
-
}):
|
|
670
|
-
const {
|
|
1085
|
+
}): Task<void> {
|
|
1086
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
1087
|
+
|
|
1088
|
+
const wrappedFn = (ctrl: AbortController) => {
|
|
1089
|
+
return agentActivityStorage.run(this, () => {
|
|
1090
|
+
// Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
|
|
1091
|
+
// before post-construction metadata is attached to the Task instance.
|
|
1092
|
+
const currentTask = Task.current();
|
|
1093
|
+
if (currentTask) {
|
|
1094
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1097
|
+
if (ownedSpeechHandle) {
|
|
1098
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
1099
|
+
}
|
|
1100
|
+
return taskFn(ctrl);
|
|
1101
|
+
});
|
|
1102
|
+
};
|
|
1103
|
+
|
|
1104
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
1105
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
671
1106
|
|
|
672
1107
|
this.speechTasks.add(task);
|
|
673
1108
|
task.addDoneCallback(() => {
|
|
@@ -687,34 +1122,50 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
687
1122
|
this.wakeupMainTask();
|
|
688
1123
|
});
|
|
689
1124
|
|
|
690
|
-
return task
|
|
1125
|
+
return task;
|
|
691
1126
|
}
|
|
692
1127
|
|
|
693
1128
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
694
|
-
if (this.
|
|
695
|
-
this.
|
|
696
|
-
|
|
1129
|
+
if (this.schedulingPaused) {
|
|
1130
|
+
this.cancelPreemptiveGeneration();
|
|
1131
|
+
this.logger.warn(
|
|
1132
|
+
{ user_input: info.newTranscript },
|
|
1133
|
+
'skipping user input, speech scheduling is paused',
|
|
1134
|
+
);
|
|
697
1135
|
// TODO(shubhra): should we "forward" this new turn to the next agent/activity?
|
|
698
1136
|
return true;
|
|
699
1137
|
}
|
|
700
1138
|
|
|
1139
|
+
// Refactored interruption word count check for consistency with onVADInferenceDone:
|
|
1140
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
1141
|
+
// - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
|
|
701
1142
|
if (
|
|
702
1143
|
this.stt &&
|
|
703
1144
|
this.turnDetection !== 'manual' &&
|
|
704
1145
|
this._currentSpeech &&
|
|
705
1146
|
this._currentSpeech.allowInterruptions &&
|
|
706
1147
|
!this._currentSpeech.interrupted &&
|
|
707
|
-
this.agentSession.
|
|
708
|
-
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
|
|
1148
|
+
this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0
|
|
709
1149
|
) {
|
|
710
|
-
|
|
711
|
-
this.
|
|
712
|
-
|
|
1150
|
+
const wordCount = splitWords(info.newTranscript, true).length;
|
|
1151
|
+
if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
|
|
1152
|
+
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
1153
|
+
this.cancelPreemptiveGeneration();
|
|
1154
|
+
this.logger.info(
|
|
1155
|
+
{
|
|
1156
|
+
wordCount,
|
|
1157
|
+
minInterruptionWords:
|
|
1158
|
+
this.agentSession.sessionOptions.turnHandling.interruption.minWords,
|
|
1159
|
+
},
|
|
1160
|
+
'skipping user input, word count below minimum interruption threshold',
|
|
1161
|
+
);
|
|
1162
|
+
return false;
|
|
1163
|
+
}
|
|
713
1164
|
}
|
|
714
1165
|
|
|
715
1166
|
const oldTask = this._userTurnCompletedTask;
|
|
716
1167
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
717
|
-
|
|
1168
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
718
1169
|
name: 'AgentActivity.userTurnCompleted',
|
|
719
1170
|
});
|
|
720
1171
|
return true;
|
|
@@ -744,16 +1195,28 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
744
1195
|
throw new Error('Speech queue is empty');
|
|
745
1196
|
}
|
|
746
1197
|
const speechHandle = heapItem[2];
|
|
1198
|
+
|
|
1199
|
+
// Skip speech handles that were already interrupted/done before being
|
|
1200
|
+
// picked up from the queue (e.g. interrupted during shutdown before the
|
|
1201
|
+
// main loop had a chance to process them). Calling _authorizeGeneration
|
|
1202
|
+
// on a done handle would create a generation Future that nobody resolves,
|
|
1203
|
+
// causing the main loop to hang forever.
|
|
1204
|
+
if (speechHandle.interrupted || speechHandle.done()) {
|
|
1205
|
+
continue;
|
|
1206
|
+
}
|
|
1207
|
+
|
|
747
1208
|
this._currentSpeech = speechHandle;
|
|
748
1209
|
speechHandle._authorizeGeneration();
|
|
749
|
-
await speechHandle._waitForGeneration();
|
|
1210
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
|
|
750
1211
|
this._currentSpeech = undefined;
|
|
751
1212
|
}
|
|
752
1213
|
|
|
753
|
-
//
|
|
754
|
-
//
|
|
755
|
-
|
|
756
|
-
|
|
1214
|
+
// if we're draining/pausing and there are no more speech tasks, we can exit.
|
|
1215
|
+
// only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
|
|
1216
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
1217
|
+
|
|
1218
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
1219
|
+
this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
|
|
757
1220
|
break;
|
|
758
1221
|
}
|
|
759
1222
|
|
|
@@ -763,6 +1226,39 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
763
1226
|
this.logger.info('AgentActivity mainTask: exiting');
|
|
764
1227
|
}
|
|
765
1228
|
|
|
1229
|
+
private getDrainPendingSpeechTasks(): Task<void>[] {
|
|
1230
|
+
const blockedHandles: SpeechHandle[] = [];
|
|
1231
|
+
|
|
1232
|
+
for (const task of this._drainBlockedTasks) {
|
|
1233
|
+
const info = _getActivityTaskInfo(task);
|
|
1234
|
+
if (!info) {
|
|
1235
|
+
this.logger.error('blocked task without activity info; skipping.');
|
|
1236
|
+
continue;
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
if (!info.speechHandle) {
|
|
1240
|
+
continue; // onEnter/onExit
|
|
1241
|
+
}
|
|
1242
|
+
|
|
1243
|
+
blockedHandles.push(info.speechHandle);
|
|
1244
|
+
}
|
|
1245
|
+
|
|
1246
|
+
const toWait: Task<void>[] = [];
|
|
1247
|
+
for (const task of this.speechTasks) {
|
|
1248
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
1249
|
+
continue;
|
|
1250
|
+
}
|
|
1251
|
+
|
|
1252
|
+
const info = _getActivityTaskInfo(task);
|
|
1253
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
1254
|
+
continue;
|
|
1255
|
+
}
|
|
1256
|
+
|
|
1257
|
+
toWait.push(task);
|
|
1258
|
+
}
|
|
1259
|
+
return toWait;
|
|
1260
|
+
}
|
|
1261
|
+
|
|
766
1262
|
private wakeupMainTask(): void {
|
|
767
1263
|
this.q_updated.resolve();
|
|
768
1264
|
}
|
|
@@ -773,6 +1269,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
773
1269
|
instructions?: string;
|
|
774
1270
|
toolChoice?: ToolChoice | null;
|
|
775
1271
|
allowInterruptions?: boolean;
|
|
1272
|
+
scheduleSpeech?: boolean;
|
|
776
1273
|
}): SpeechHandle {
|
|
777
1274
|
const {
|
|
778
1275
|
userMessage,
|
|
@@ -780,6 +1277,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
780
1277
|
instructions: defaultInstructions,
|
|
781
1278
|
toolChoice: defaultToolChoice,
|
|
782
1279
|
allowInterruptions: defaultAllowInterruptions,
|
|
1280
|
+
scheduleSpeech = true,
|
|
783
1281
|
} = options;
|
|
784
1282
|
|
|
785
1283
|
let instructions = defaultInstructions;
|
|
@@ -802,7 +1300,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
802
1300
|
throw new Error('trying to generate reply without an LLM model');
|
|
803
1301
|
}
|
|
804
1302
|
|
|
805
|
-
const functionCall =
|
|
1303
|
+
const functionCall = functionCallStorage.getStore()?.functionCall;
|
|
806
1304
|
if (toolChoice === undefined && functionCall !== undefined) {
|
|
807
1305
|
// when generateReply is called inside a tool, set toolChoice to 'none' by default
|
|
808
1306
|
toolChoice = 'none';
|
|
@@ -824,7 +1322,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
824
1322
|
|
|
825
1323
|
if (this.llm instanceof RealtimeModel) {
|
|
826
1324
|
this.createSpeechTask({
|
|
827
|
-
|
|
1325
|
+
taskFn: (abortController: AbortController) =>
|
|
828
1326
|
this.realtimeReplyTask({
|
|
829
1327
|
speechHandle: handle,
|
|
830
1328
|
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
@@ -836,7 +1334,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
836
1334
|
},
|
|
837
1335
|
abortController,
|
|
838
1336
|
}),
|
|
839
|
-
),
|
|
840
1337
|
ownedSpeechHandle: handle,
|
|
841
1338
|
name: 'AgentActivity.realtimeReply',
|
|
842
1339
|
});
|
|
@@ -848,46 +1345,80 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
848
1345
|
instructions = `${this.agent.instructions}\n${instructions}`;
|
|
849
1346
|
}
|
|
850
1347
|
|
|
1348
|
+
// Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
|
|
1349
|
+
const onEnterData = onEnterStorage.getStore();
|
|
1350
|
+
const shouldFilterTools =
|
|
1351
|
+
onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
|
|
1352
|
+
|
|
1353
|
+
const tools = shouldFilterTools
|
|
1354
|
+
? Object.fromEntries(
|
|
1355
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
1356
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
|
|
1357
|
+
),
|
|
1358
|
+
)
|
|
1359
|
+
: this.agent.toolCtx;
|
|
1360
|
+
|
|
851
1361
|
const task = this.createSpeechTask({
|
|
852
|
-
|
|
1362
|
+
taskFn: (abortController: AbortController) =>
|
|
853
1363
|
this.pipelineReplyTask(
|
|
854
1364
|
handle,
|
|
855
1365
|
chatCtx ?? this.agent.chatCtx,
|
|
856
|
-
|
|
1366
|
+
tools,
|
|
857
1367
|
{
|
|
858
1368
|
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
859
1369
|
},
|
|
860
1370
|
abortController,
|
|
861
|
-
instructions
|
|
1371
|
+
instructions,
|
|
862
1372
|
userMessage,
|
|
863
1373
|
),
|
|
864
|
-
),
|
|
865
1374
|
ownedSpeechHandle: handle,
|
|
866
1375
|
name: 'AgentActivity.pipelineReply',
|
|
867
1376
|
});
|
|
868
1377
|
|
|
869
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
1378
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
870
1379
|
}
|
|
871
1380
|
|
|
872
|
-
|
|
1381
|
+
if (scheduleSpeech) {
|
|
1382
|
+
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1383
|
+
}
|
|
873
1384
|
return handle;
|
|
874
1385
|
}
|
|
875
1386
|
|
|
876
|
-
interrupt(): Future<void> {
|
|
1387
|
+
interrupt(options: { force?: boolean } = {}): Future<void> {
|
|
1388
|
+
const { force = false } = options;
|
|
1389
|
+
this.cancelPreemptiveGeneration();
|
|
1390
|
+
|
|
877
1391
|
const future = new Future<void>();
|
|
878
1392
|
const currentSpeech = this._currentSpeech;
|
|
879
1393
|
|
|
880
1394
|
//TODO(AJS-273): add interrupt for background speeches
|
|
881
1395
|
|
|
882
|
-
currentSpeech?.interrupt();
|
|
1396
|
+
currentSpeech?.interrupt(force);
|
|
883
1397
|
|
|
884
1398
|
for (const [_, __, speech] of this.speechQueue) {
|
|
885
|
-
speech.interrupt();
|
|
1399
|
+
speech.interrupt(force);
|
|
886
1400
|
}
|
|
887
1401
|
|
|
888
1402
|
this.realtimeSession?.interrupt();
|
|
889
1403
|
|
|
890
|
-
if (
|
|
1404
|
+
if (force) {
|
|
1405
|
+
// Force-interrupt (used during shutdown): cancel all speech tasks so they
|
|
1406
|
+
// don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
|
|
1407
|
+
// when the room is disconnected). Mark the current speech as done immediately
|
|
1408
|
+
// so the interrupt future resolves without waiting for tasks to finish.
|
|
1409
|
+
// Clear the queue so mainTask doesn't dequeue already-interrupted handles
|
|
1410
|
+
// and hang on _waitForGeneration() (the generation future created by
|
|
1411
|
+
// _authorizeGeneration would never resolve since _markDone is a no-op
|
|
1412
|
+
// once doneFut is already settled).
|
|
1413
|
+
for (const task of this.speechTasks) {
|
|
1414
|
+
task.cancel();
|
|
1415
|
+
}
|
|
1416
|
+
if (currentSpeech && !currentSpeech.done()) {
|
|
1417
|
+
currentSpeech._markDone();
|
|
1418
|
+
}
|
|
1419
|
+
this.speechQueue.clear();
|
|
1420
|
+
future.resolve();
|
|
1421
|
+
} else if (currentSpeech === undefined) {
|
|
891
1422
|
future.resolve();
|
|
892
1423
|
} else {
|
|
893
1424
|
currentSpeech.addDoneCallback(() => {
|
|
@@ -905,13 +1436,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
905
1436
|
}
|
|
906
1437
|
}
|
|
907
1438
|
|
|
908
|
-
private async userTurnCompleted(info: EndOfTurnInfo, oldTask?:
|
|
1439
|
+
private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
|
|
909
1440
|
if (oldTask) {
|
|
910
1441
|
// We never cancel user code as this is very confusing.
|
|
911
1442
|
// So we wait for the old execution of onUserTurnCompleted to finish.
|
|
912
1443
|
// In practice this is OK because most speeches will be interrupted if a new turn
|
|
913
1444
|
// is detected. So the previous execution should complete quickly.
|
|
914
|
-
await oldTask;
|
|
1445
|
+
await oldTask.result;
|
|
915
1446
|
}
|
|
916
1447
|
|
|
917
1448
|
// When the audio recognition detects the end of a user turn:
|
|
@@ -949,6 +1480,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
949
1480
|
let userMessage: ChatMessage | undefined = ChatMessage.create({
|
|
950
1481
|
role: 'user',
|
|
951
1482
|
content: info.newTranscript,
|
|
1483
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
952
1484
|
});
|
|
953
1485
|
|
|
954
1486
|
// create a temporary mutable chat context to pass to onUserTurnCompleted
|
|
@@ -975,16 +1507,74 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
975
1507
|
return;
|
|
976
1508
|
}
|
|
977
1509
|
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
1510
|
+
const userMetricsReport: MetricsReport = {};
|
|
1511
|
+
if (info.startedSpeakingAt !== undefined) {
|
|
1512
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
|
|
1513
|
+
}
|
|
1514
|
+
if (info.stoppedSpeakingAt !== undefined) {
|
|
1515
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
|
|
1516
|
+
}
|
|
1517
|
+
if (info.transcriptionDelay !== undefined) {
|
|
1518
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
|
|
1519
|
+
}
|
|
1520
|
+
if (info.endOfUtteranceDelay !== undefined) {
|
|
1521
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
|
|
1522
|
+
}
|
|
1523
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
|
|
1524
|
+
if (userMessage) {
|
|
1525
|
+
userMessage.metrics = userMetricsReport;
|
|
1526
|
+
}
|
|
981
1527
|
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
1528
|
+
let speechHandle: SpeechHandle | undefined;
|
|
1529
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
1530
|
+
const preemptive = this._preemptiveGeneration;
|
|
1531
|
+
// make sure the onUserTurnCompleted didn't change some request parameters
|
|
1532
|
+
// otherwise invalidate the preemptive generation
|
|
1533
|
+
if (
|
|
1534
|
+
preemptive.info.newTranscript === userMessage?.textContent &&
|
|
1535
|
+
preemptive.chatCtx.isEquivalent(chatCtx) &&
|
|
1536
|
+
isSameToolContext(preemptive.tools, this.tools) &&
|
|
1537
|
+
isSameToolChoice(preemptive.toolChoice, this.toolChoice)
|
|
1538
|
+
) {
|
|
1539
|
+
speechHandle = preemptive.speechHandle;
|
|
1540
|
+
// The preemptive userMessage was created without metrics.
|
|
1541
|
+
// Copy the metrics and transcriptConfidence from the new userMessage
|
|
1542
|
+
// to the preemptive message BEFORE scheduling (so the pipeline inserts
|
|
1543
|
+
// the message with metrics already set).
|
|
1544
|
+
if (preemptive.userMessage && userMessage) {
|
|
1545
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1546
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1547
|
+
}
|
|
1548
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1549
|
+
this.logger.debug(
|
|
1550
|
+
{
|
|
1551
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt,
|
|
1552
|
+
},
|
|
1553
|
+
'using preemptive generation',
|
|
1554
|
+
);
|
|
1555
|
+
} else {
|
|
1556
|
+
this.logger.warn(
|
|
1557
|
+
'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
|
|
1558
|
+
);
|
|
1559
|
+
preemptive.speechHandle._cancel();
|
|
1560
|
+
}
|
|
1561
|
+
|
|
1562
|
+
this._preemptiveGeneration = undefined;
|
|
1563
|
+
}
|
|
1564
|
+
|
|
1565
|
+
if (speechHandle === undefined) {
|
|
1566
|
+
// Ensure the new message is passed to generateReply
|
|
1567
|
+
// This preserves the original message id, making it easier for users to track responses
|
|
1568
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
1569
|
+
}
|
|
1570
|
+
|
|
1571
|
+
const eouMetrics: EOUMetrics = {
|
|
1572
|
+
type: 'eou_metrics',
|
|
1573
|
+
timestamp: Date.now(),
|
|
1574
|
+
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
1575
|
+
transcriptionDelayMs: info.transcriptionDelay,
|
|
1576
|
+
onUserTurnCompletedDelayMs: callbackDuration,
|
|
1577
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
988
1578
|
speechId: speechHandle.id,
|
|
989
1579
|
};
|
|
990
1580
|
|
|
@@ -1002,6 +1592,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1002
1592
|
replyAbortController: AbortController,
|
|
1003
1593
|
audio?: ReadableStream<AudioFrame> | null,
|
|
1004
1594
|
): Promise<void> {
|
|
1595
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1596
|
+
|
|
1005
1597
|
speechHandleStorage.enterWith(speechHandle);
|
|
1006
1598
|
|
|
1007
1599
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled
|
|
@@ -1046,28 +1638,44 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1046
1638
|
tasks.push(textForwardTask);
|
|
1047
1639
|
}
|
|
1048
1640
|
|
|
1049
|
-
|
|
1050
|
-
|
|
1641
|
+
let replyStartedSpeakingAt: number | undefined;
|
|
1642
|
+
let replyTtsGenData: _TTSGenerationData | null = null;
|
|
1643
|
+
|
|
1644
|
+
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1645
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1646
|
+
this.agentSession._updateAgentState('speaking', {
|
|
1647
|
+
startTime: startedSpeakingAt,
|
|
1648
|
+
otelContext: speechHandle._agentTurnContext,
|
|
1649
|
+
});
|
|
1650
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1651
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1652
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1653
|
+
}
|
|
1051
1654
|
};
|
|
1052
1655
|
|
|
1053
1656
|
if (!audioOutput) {
|
|
1054
1657
|
if (textOut) {
|
|
1055
|
-
textOut.firstTextFut.await
|
|
1658
|
+
textOut.firstTextFut.await
|
|
1659
|
+
.then(() => onFirstFrame())
|
|
1660
|
+
.catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
|
|
1056
1661
|
}
|
|
1057
1662
|
} else {
|
|
1058
1663
|
let audioOut: _AudioOut | null = null;
|
|
1059
1664
|
if (!audio) {
|
|
1060
1665
|
// generate audio using TTS
|
|
1061
|
-
const [ttsTask,
|
|
1666
|
+
const [ttsTask, ttsGenData] = performTTSInference(
|
|
1062
1667
|
(...args) => this.agent.ttsNode(...args),
|
|
1063
1668
|
audioSource,
|
|
1064
1669
|
modelSettings,
|
|
1065
1670
|
replyAbortController,
|
|
1671
|
+
this.tts?.model,
|
|
1672
|
+
this.tts?.provider,
|
|
1066
1673
|
);
|
|
1067
1674
|
tasks.push(ttsTask);
|
|
1675
|
+
replyTtsGenData = ttsGenData;
|
|
1068
1676
|
|
|
1069
1677
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1070
|
-
|
|
1678
|
+
ttsGenData.audioStream,
|
|
1071
1679
|
audioOutput,
|
|
1072
1680
|
replyAbortController,
|
|
1073
1681
|
);
|
|
@@ -1083,7 +1691,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1083
1691
|
tasks.push(forwardTask);
|
|
1084
1692
|
audioOut = _audioOut;
|
|
1085
1693
|
}
|
|
1086
|
-
audioOut.firstFrameFut.await
|
|
1694
|
+
audioOut.firstFrameFut.await
|
|
1695
|
+
.then((ts) => onFirstFrame(ts))
|
|
1696
|
+
.catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
|
|
1087
1697
|
}
|
|
1088
1698
|
|
|
1089
1699
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
@@ -1102,10 +1712,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1102
1712
|
}
|
|
1103
1713
|
|
|
1104
1714
|
if (addToChatCtx) {
|
|
1715
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1716
|
+
const replyAssistantMetrics: MetricsReport = {};
|
|
1717
|
+
if (replyTtsGenData?.ttfb !== undefined) {
|
|
1718
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1719
|
+
}
|
|
1720
|
+
if (replyStartedSpeakingAt !== undefined) {
|
|
1721
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
|
|
1722
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1105
1725
|
const message = ChatMessage.create({
|
|
1106
1726
|
role: 'assistant',
|
|
1107
1727
|
content: textOut?.text || '',
|
|
1108
1728
|
interrupted: speechHandle.interrupted,
|
|
1729
|
+
metrics: replyAssistantMetrics,
|
|
1109
1730
|
});
|
|
1110
1731
|
this.agent._chatCtx.insert(message);
|
|
1111
1732
|
this.agentSession._conversationItemAdded(message);
|
|
@@ -1113,19 +1734,51 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1113
1734
|
|
|
1114
1735
|
if (this.agentSession.agentState === 'speaking') {
|
|
1115
1736
|
this.agentSession._updateAgentState('listening');
|
|
1737
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1738
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1739
|
+
}
|
|
1740
|
+
this.restoreInterruptionByAudioActivity();
|
|
1116
1741
|
}
|
|
1117
1742
|
}
|
|
1118
1743
|
|
|
1119
|
-
private async
|
|
1120
|
-
speechHandle
|
|
1121
|
-
chatCtx
|
|
1122
|
-
toolCtx
|
|
1123
|
-
modelSettings
|
|
1124
|
-
replyAbortController
|
|
1125
|
-
instructions
|
|
1126
|
-
newMessage
|
|
1127
|
-
toolsMessages
|
|
1128
|
-
|
|
1744
|
+
private _pipelineReplyTaskImpl = async ({
|
|
1745
|
+
speechHandle,
|
|
1746
|
+
chatCtx,
|
|
1747
|
+
toolCtx,
|
|
1748
|
+
modelSettings,
|
|
1749
|
+
replyAbortController,
|
|
1750
|
+
instructions,
|
|
1751
|
+
newMessage,
|
|
1752
|
+
toolsMessages,
|
|
1753
|
+
span,
|
|
1754
|
+
_previousUserMetrics,
|
|
1755
|
+
}: {
|
|
1756
|
+
speechHandle: SpeechHandle;
|
|
1757
|
+
chatCtx: ChatContext;
|
|
1758
|
+
toolCtx: ToolContext;
|
|
1759
|
+
modelSettings: ModelSettings;
|
|
1760
|
+
replyAbortController: AbortController;
|
|
1761
|
+
instructions?: string;
|
|
1762
|
+
newMessage?: ChatMessage;
|
|
1763
|
+
toolsMessages?: ChatItem[];
|
|
1764
|
+
span: Span;
|
|
1765
|
+
_previousUserMetrics?: MetricsReport;
|
|
1766
|
+
}): Promise<void> => {
|
|
1767
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1768
|
+
|
|
1769
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1770
|
+
if (instructions) {
|
|
1771
|
+
span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
1772
|
+
}
|
|
1773
|
+
if (newMessage) {
|
|
1774
|
+
span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
|
|
1775
|
+
}
|
|
1776
|
+
|
|
1777
|
+
const localParticipant = this.agentSession._roomIO?.localParticipant;
|
|
1778
|
+
if (localParticipant) {
|
|
1779
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
1780
|
+
}
|
|
1781
|
+
|
|
1129
1782
|
speechHandleStorage.enterWith(speechHandle);
|
|
1130
1783
|
|
|
1131
1784
|
const audioOutput = this.agentSession.output.audioEnabled
|
|
@@ -1137,10 +1790,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1137
1790
|
|
|
1138
1791
|
chatCtx = chatCtx.copy();
|
|
1139
1792
|
|
|
1793
|
+
// Insert new message into temporary chat context for LLM inference
|
|
1140
1794
|
if (newMessage) {
|
|
1141
1795
|
chatCtx.insert(newMessage);
|
|
1142
|
-
this.agent._chatCtx.insert(newMessage);
|
|
1143
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
1144
1796
|
}
|
|
1145
1797
|
|
|
1146
1798
|
if (instructions) {
|
|
@@ -1155,7 +1807,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1155
1807
|
}
|
|
1156
1808
|
}
|
|
1157
1809
|
|
|
1158
|
-
this.agentSession._updateAgentState('thinking');
|
|
1159
1810
|
const tasks: Array<Task<void>> = [];
|
|
1160
1811
|
const [llmTask, llmGenData] = performLLMInference(
|
|
1161
1812
|
// preserve `this` context in llmNode
|
|
@@ -1164,25 +1815,43 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1164
1815
|
toolCtx,
|
|
1165
1816
|
modelSettings,
|
|
1166
1817
|
replyAbortController,
|
|
1818
|
+
this.llm?.model,
|
|
1819
|
+
this.llm?.provider,
|
|
1167
1820
|
);
|
|
1168
1821
|
tasks.push(llmTask);
|
|
1169
1822
|
|
|
1170
|
-
const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
|
|
1171
|
-
|
|
1172
1823
|
let ttsTask: Task<void> | null = null;
|
|
1173
|
-
let
|
|
1824
|
+
let ttsGenData: _TTSGenerationData | null = null;
|
|
1825
|
+
let llmOutput: ReadableStream<string>;
|
|
1826
|
+
|
|
1174
1827
|
if (audioOutput) {
|
|
1175
|
-
|
|
1828
|
+
// Only tee the stream when we need TTS
|
|
1829
|
+
const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
|
|
1830
|
+
llmOutput = textOutput;
|
|
1831
|
+
[ttsTask, ttsGenData] = performTTSInference(
|
|
1176
1832
|
(...args) => this.agent.ttsNode(...args),
|
|
1177
1833
|
ttsTextInput,
|
|
1178
1834
|
modelSettings,
|
|
1179
1835
|
replyAbortController,
|
|
1836
|
+
this.tts?.model,
|
|
1837
|
+
this.tts?.provider,
|
|
1180
1838
|
);
|
|
1181
1839
|
tasks.push(ttsTask);
|
|
1840
|
+
} else {
|
|
1841
|
+
// No TTS needed, use the stream directly
|
|
1842
|
+
llmOutput = llmGenData.textStream;
|
|
1182
1843
|
}
|
|
1183
1844
|
|
|
1184
1845
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1185
1846
|
|
|
1847
|
+
let userMetrics: MetricsReport | undefined = _previousUserMetrics;
|
|
1848
|
+
// Add new message to actual chat context if the speech is scheduled
|
|
1849
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1850
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1851
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1852
|
+
userMetrics = newMessage.metrics;
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1186
1855
|
if (speechHandle.interrupted) {
|
|
1187
1856
|
replyAbortController.abort();
|
|
1188
1857
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -1195,7 +1864,26 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1195
1864
|
speechHandle._clearAuthorization();
|
|
1196
1865
|
|
|
1197
1866
|
const replyStartedAt = Date.now();
|
|
1198
|
-
|
|
1867
|
+
|
|
1868
|
+
// Determine the transcription input source
|
|
1869
|
+
let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
|
|
1870
|
+
|
|
1871
|
+
// Check if we should use TTS aligned transcripts
|
|
1872
|
+
if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
|
|
1873
|
+
// Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
|
|
1874
|
+
const timedTextsStream = await Promise.race([
|
|
1875
|
+
ttsGenData.timedTextsFut.await,
|
|
1876
|
+
ttsTask?.result.catch(() =>
|
|
1877
|
+
this.logger.warn('TTS task failed before resolving timedTextsFut'),
|
|
1878
|
+
) ?? Promise.resolve(),
|
|
1879
|
+
]);
|
|
1880
|
+
if (timedTextsStream) {
|
|
1881
|
+
this.logger.debug('Using TTS aligned transcripts for transcription node input');
|
|
1882
|
+
transcriptionInput = timedTextsStream;
|
|
1883
|
+
}
|
|
1884
|
+
}
|
|
1885
|
+
|
|
1886
|
+
const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
|
|
1199
1887
|
let textOut: _TextOut | null = null;
|
|
1200
1888
|
if (trNodeResult) {
|
|
1201
1889
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -1207,37 +1895,54 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1207
1895
|
textOut = _textOut;
|
|
1208
1896
|
}
|
|
1209
1897
|
|
|
1210
|
-
|
|
1211
|
-
|
|
1898
|
+
let agentStartedSpeakingAt: number | undefined;
|
|
1899
|
+
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1900
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1901
|
+
this.agentSession._updateAgentState('speaking', {
|
|
1902
|
+
startTime: startedSpeakingAt,
|
|
1903
|
+
otelContext: speechHandle._agentTurnContext,
|
|
1904
|
+
});
|
|
1905
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1906
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1907
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1908
|
+
}
|
|
1212
1909
|
};
|
|
1213
1910
|
|
|
1214
1911
|
let audioOut: _AudioOut | null = null;
|
|
1215
1912
|
if (audioOutput) {
|
|
1216
|
-
if (
|
|
1913
|
+
if (ttsGenData) {
|
|
1217
1914
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1218
|
-
|
|
1915
|
+
ttsGenData.audioStream,
|
|
1219
1916
|
audioOutput,
|
|
1220
1917
|
replyAbortController,
|
|
1221
1918
|
);
|
|
1222
1919
|
audioOut = _audioOut;
|
|
1223
1920
|
tasks.push(forwardTask);
|
|
1224
|
-
audioOut.firstFrameFut.await
|
|
1921
|
+
audioOut.firstFrameFut.await
|
|
1922
|
+
.then((ts) => onFirstFrame(ts))
|
|
1923
|
+
.catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
|
|
1225
1924
|
} else {
|
|
1226
|
-
throw Error('
|
|
1925
|
+
throw Error('ttsGenData is null when audioOutput is enabled');
|
|
1227
1926
|
}
|
|
1228
1927
|
} else {
|
|
1229
|
-
textOut?.firstTextFut.await
|
|
1928
|
+
textOut?.firstTextFut.await
|
|
1929
|
+
.then(() => onFirstFrame())
|
|
1930
|
+
.catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
|
|
1230
1931
|
}
|
|
1231
1932
|
|
|
1232
1933
|
//TODO(AJS-272): before executing tools, make sure we generated all the text
|
|
1233
1934
|
// (this ensure everything is kept ordered)
|
|
1234
1935
|
|
|
1235
|
-
const onToolExecutionStarted = (
|
|
1236
|
-
|
|
1936
|
+
const onToolExecutionStarted = (f: FunctionCall) => {
|
|
1937
|
+
speechHandle._itemAdded([f]);
|
|
1938
|
+
this.agent._chatCtx.items.push(f);
|
|
1939
|
+
this.agentSession._toolItemsAdded([f]);
|
|
1237
1940
|
};
|
|
1238
1941
|
|
|
1239
|
-
const onToolExecutionCompleted = (
|
|
1240
|
-
|
|
1942
|
+
const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
|
|
1943
|
+
if (out.toolCallOutput) {
|
|
1944
|
+
speechHandle._itemAdded([out.toolCallOutput]);
|
|
1945
|
+
}
|
|
1241
1946
|
};
|
|
1242
1947
|
|
|
1243
1948
|
const [executeToolsTask, toolOutput] = performToolExecutions({
|
|
@@ -1257,12 +1962,45 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1257
1962
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1258
1963
|
}
|
|
1259
1964
|
|
|
1965
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1966
|
+
const assistantMetrics: MetricsReport = {};
|
|
1967
|
+
|
|
1968
|
+
if (llmGenData.ttft !== undefined) {
|
|
1969
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
|
|
1970
|
+
}
|
|
1971
|
+
if (ttsGenData?.ttfb !== undefined) {
|
|
1972
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
|
|
1973
|
+
}
|
|
1974
|
+
if (agentStartedSpeakingAt !== undefined) {
|
|
1975
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
|
|
1976
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1977
|
+
|
|
1978
|
+
if (userMetrics?.stoppedSpeakingAt !== undefined) {
|
|
1979
|
+
const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
|
|
1980
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1981
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1982
|
+
}
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1986
|
+
let hasSpeechMessage = false;
|
|
1987
|
+
|
|
1260
1988
|
// add the tools messages that triggers this reply to the chat context
|
|
1261
1989
|
if (toolsMessages) {
|
|
1262
1990
|
for (const msg of toolsMessages) {
|
|
1263
1991
|
msg.createdAt = replyStartedAt;
|
|
1264
1992
|
}
|
|
1265
|
-
|
|
1993
|
+
// Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
|
|
1994
|
+
// were already added by onToolExecutionStarted when the tool execution began.
|
|
1995
|
+
// Inserting function_calls again would create duplicates that break provider APIs
|
|
1996
|
+
// (e.g. Google's "function response parts != function call parts" error).
|
|
1997
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1998
|
+
(m): m is FunctionCallOutput => m.type === 'function_call_output',
|
|
1999
|
+
);
|
|
2000
|
+
if (toolCallOutputs.length > 0) {
|
|
2001
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
2002
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
2003
|
+
}
|
|
1266
2004
|
}
|
|
1267
2005
|
|
|
1268
2006
|
if (speechHandle.interrupted) {
|
|
@@ -1270,20 +2008,24 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1270
2008
|
{ speech_id: speechHandle.id },
|
|
1271
2009
|
'Aborting all pipeline reply tasks due to interruption',
|
|
1272
2010
|
);
|
|
2011
|
+
|
|
2012
|
+
// Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
|
|
2013
|
+
// will correctly (but undesirably) commit a long transcript even though the user said "stop".
|
|
2014
|
+
if (audioOutput) {
|
|
2015
|
+
audioOutput.clearBuffer();
|
|
2016
|
+
}
|
|
2017
|
+
|
|
1273
2018
|
replyAbortController.abort();
|
|
1274
|
-
await
|
|
1275
|
-
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
|
|
1276
|
-
);
|
|
2019
|
+
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1277
2020
|
|
|
1278
2021
|
let forwardedText = textOut?.text || '';
|
|
1279
2022
|
|
|
1280
2023
|
if (audioOutput) {
|
|
1281
|
-
audioOutput.clearBuffer();
|
|
1282
2024
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1283
|
-
if (audioOut?.firstFrameFut.done) {
|
|
2025
|
+
if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
|
|
1284
2026
|
// playback EV is valid only if the first frame was already played
|
|
1285
2027
|
this.logger.info(
|
|
1286
|
-
{ speech_id: speechHandle.id,
|
|
2028
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
1287
2029
|
'playout interrupted',
|
|
1288
2030
|
);
|
|
1289
2031
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1295,43 +2037,54 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1295
2037
|
}
|
|
1296
2038
|
|
|
1297
2039
|
if (forwardedText) {
|
|
2040
|
+
hasSpeechMessage = true;
|
|
1298
2041
|
const message = ChatMessage.create({
|
|
1299
2042
|
role: 'assistant',
|
|
1300
2043
|
content: forwardedText,
|
|
1301
2044
|
id: llmGenData.id,
|
|
1302
2045
|
interrupted: true,
|
|
1303
2046
|
createdAt: replyStartedAt,
|
|
2047
|
+
metrics: assistantMetrics,
|
|
1304
2048
|
});
|
|
1305
2049
|
chatCtx.insert(message);
|
|
1306
2050
|
this.agent._chatCtx.insert(message);
|
|
2051
|
+
speechHandle._itemAdded([message]);
|
|
1307
2052
|
this.agentSession._conversationItemAdded(message);
|
|
2053
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1308
2054
|
}
|
|
1309
2055
|
|
|
1310
2056
|
if (this.agentSession.agentState === 'speaking') {
|
|
1311
2057
|
this.agentSession._updateAgentState('listening');
|
|
2058
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
2059
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
2060
|
+
this.restoreInterruptionByAudioActivity();
|
|
2061
|
+
}
|
|
1312
2062
|
}
|
|
1313
2063
|
|
|
1314
2064
|
this.logger.info(
|
|
1315
2065
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
1316
2066
|
'playout completed with interrupt',
|
|
1317
2067
|
);
|
|
1318
|
-
// TODO(shubhra) add chat message to speech handle
|
|
1319
2068
|
speechHandle._markGenerationDone();
|
|
1320
2069
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1321
2070
|
return;
|
|
1322
2071
|
}
|
|
1323
2072
|
|
|
1324
2073
|
if (textOut && textOut.text) {
|
|
2074
|
+
hasSpeechMessage = true;
|
|
1325
2075
|
const message = ChatMessage.create({
|
|
1326
2076
|
role: 'assistant',
|
|
1327
2077
|
id: llmGenData.id,
|
|
1328
2078
|
interrupted: false,
|
|
1329
2079
|
createdAt: replyStartedAt,
|
|
1330
2080
|
content: textOut.text,
|
|
2081
|
+
metrics: assistantMetrics,
|
|
1331
2082
|
});
|
|
1332
2083
|
chatCtx.insert(message);
|
|
1333
2084
|
this.agent._chatCtx.insert(message);
|
|
2085
|
+
speechHandle._itemAdded([message]);
|
|
1334
2086
|
this.agentSession._conversationItemAdded(message);
|
|
2087
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1335
2088
|
this.logger.info(
|
|
1336
2089
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1337
2090
|
'playout completed without interruption',
|
|
@@ -1342,6 +2095,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1342
2095
|
this.agentSession._updateAgentState('thinking');
|
|
1343
2096
|
} else if (this.agentSession.agentState === 'speaking') {
|
|
1344
2097
|
this.agentSession._updateAgentState('listening');
|
|
2098
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
2099
|
+
{
|
|
2100
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
2101
|
+
this.restoreInterruptionByAudioActivity();
|
|
2102
|
+
}
|
|
2103
|
+
}
|
|
1345
2104
|
}
|
|
1346
2105
|
|
|
1347
2106
|
// mark the playout done before waiting for the tool execution
|
|
@@ -1351,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1351
2110
|
if (toolOutput.output.length === 0) return;
|
|
1352
2111
|
|
|
1353
2112
|
// important: no agent output should be used after this point
|
|
1354
|
-
const { maxToolSteps } = this.agentSession.
|
|
2113
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
1355
2114
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1356
2115
|
this.logger.warn(
|
|
1357
2116
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1360,52 +2119,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1360
2119
|
return;
|
|
1361
2120
|
}
|
|
1362
2121
|
|
|
1363
|
-
const functionToolsExecutedEvent =
|
|
1364
|
-
|
|
1365
|
-
functionCallOutputs: [],
|
|
1366
|
-
});
|
|
1367
|
-
let shouldGenerateToolReply: boolean = false;
|
|
1368
|
-
let newAgentTask: Agent | null = null;
|
|
1369
|
-
let ignoreTaskSwitch: boolean = false;
|
|
1370
|
-
|
|
1371
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1372
|
-
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
1373
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1374
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1375
|
-
if (sanitizedOut.replyRequired) {
|
|
1376
|
-
shouldGenerateToolReply = true;
|
|
1377
|
-
}
|
|
1378
|
-
}
|
|
1379
|
-
|
|
1380
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
1381
|
-
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
1382
|
-
ignoreTaskSwitch = true;
|
|
1383
|
-
// TODO(brian): should we mark the function call as failed to notify the LLM?
|
|
1384
|
-
}
|
|
1385
|
-
|
|
1386
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1387
|
-
|
|
1388
|
-
this.logger.debug(
|
|
1389
|
-
{
|
|
1390
|
-
speechId: speechHandle.id,
|
|
1391
|
-
name: sanitizedOut.toolCall?.name,
|
|
1392
|
-
args: sanitizedOut.toolCall.args,
|
|
1393
|
-
output: sanitizedOut.toolCallOutput?.output,
|
|
1394
|
-
isError: sanitizedOut.toolCallOutput?.isError,
|
|
1395
|
-
},
|
|
1396
|
-
'Tool call execution finished',
|
|
1397
|
-
);
|
|
1398
|
-
}
|
|
2122
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
|
|
2123
|
+
this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1399
2124
|
|
|
1400
2125
|
this.agentSession.emit(
|
|
1401
2126
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1402
2127
|
functionToolsExecutedEvent,
|
|
1403
2128
|
);
|
|
1404
2129
|
|
|
1405
|
-
let
|
|
2130
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1406
2131
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1407
2132
|
this.agentSession.updateAgent(newAgentTask);
|
|
1408
|
-
|
|
2133
|
+
schedulingPaused = true;
|
|
1409
2134
|
}
|
|
1410
2135
|
|
|
1411
2136
|
const toolMessages = [
|
|
@@ -1415,28 +2140,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1415
2140
|
if (shouldGenerateToolReply) {
|
|
1416
2141
|
chatCtx.insert(toolMessages);
|
|
1417
2142
|
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
stepIndex: speechHandle._stepIndex + 1,
|
|
1421
|
-
parent: speechHandle,
|
|
1422
|
-
});
|
|
1423
|
-
this.agentSession.emit(
|
|
1424
|
-
AgentSessionEventTypes.SpeechCreated,
|
|
1425
|
-
createSpeechCreatedEvent({
|
|
1426
|
-
userInitiated: false,
|
|
1427
|
-
source: 'tool_response',
|
|
1428
|
-
speechHandle: handle,
|
|
1429
|
-
}),
|
|
1430
|
-
);
|
|
2143
|
+
// Increment step count on SAME handle (parity with Python agent_activity.py L2081)
|
|
2144
|
+
speechHandle._numSteps += 1;
|
|
1431
2145
|
|
|
1432
2146
|
// Avoid setting tool_choice to "required" or a specific function when
|
|
1433
2147
|
// passing tool response back to the LLM
|
|
1434
|
-
const respondToolChoice =
|
|
2148
|
+
const respondToolChoice =
|
|
2149
|
+
schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1435
2150
|
|
|
2151
|
+
// Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
|
|
1436
2152
|
const toolResponseTask = this.createSpeechTask({
|
|
1437
|
-
|
|
2153
|
+
taskFn: () =>
|
|
1438
2154
|
this.pipelineReplyTask(
|
|
1439
|
-
|
|
2155
|
+
speechHandle,
|
|
1440
2156
|
chatCtx,
|
|
1441
2157
|
toolCtx,
|
|
1442
2158
|
{ toolChoice: respondToolChoice },
|
|
@@ -1444,22 +2160,61 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1444
2160
|
instructions,
|
|
1445
2161
|
undefined,
|
|
1446
2162
|
toolMessages,
|
|
2163
|
+
hasSpeechMessage ? undefined : userMetrics,
|
|
1447
2164
|
),
|
|
1448
|
-
|
|
1449
|
-
ownedSpeechHandle: handle,
|
|
2165
|
+
ownedSpeechHandle: speechHandle,
|
|
1450
2166
|
name: 'AgentActivity.pipelineReply',
|
|
1451
2167
|
});
|
|
1452
2168
|
|
|
1453
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
2169
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1454
2170
|
|
|
1455
|
-
this.scheduleSpeech(
|
|
2171
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1456
2172
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1457
2173
|
for (const msg of toolMessages) {
|
|
1458
2174
|
msg.createdAt = replyStartedAt;
|
|
1459
2175
|
}
|
|
1460
|
-
|
|
2176
|
+
|
|
2177
|
+
const toolCallOutputs = toolMessages.filter(
|
|
2178
|
+
(m): m is FunctionCallOutput => m.type === 'function_call_output',
|
|
2179
|
+
);
|
|
2180
|
+
|
|
2181
|
+
if (toolCallOutputs.length > 0) {
|
|
2182
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
2183
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
2184
|
+
}
|
|
1461
2185
|
}
|
|
1462
|
-
}
|
|
2186
|
+
};
|
|
2187
|
+
|
|
2188
|
+
private pipelineReplyTask = async (
|
|
2189
|
+
speechHandle: SpeechHandle,
|
|
2190
|
+
chatCtx: ChatContext,
|
|
2191
|
+
toolCtx: ToolContext,
|
|
2192
|
+
modelSettings: ModelSettings,
|
|
2193
|
+
replyAbortController: AbortController,
|
|
2194
|
+
instructions?: string,
|
|
2195
|
+
newMessage?: ChatMessage,
|
|
2196
|
+
toolsMessages?: ChatItem[],
|
|
2197
|
+
_previousUserMetrics?: MetricsReport,
|
|
2198
|
+
): Promise<void> =>
|
|
2199
|
+
tracer.startActiveSpan(
|
|
2200
|
+
async (span) =>
|
|
2201
|
+
this._pipelineReplyTaskImpl({
|
|
2202
|
+
speechHandle,
|
|
2203
|
+
chatCtx,
|
|
2204
|
+
toolCtx,
|
|
2205
|
+
modelSettings,
|
|
2206
|
+
replyAbortController,
|
|
2207
|
+
instructions,
|
|
2208
|
+
newMessage,
|
|
2209
|
+
toolsMessages,
|
|
2210
|
+
span,
|
|
2211
|
+
_previousUserMetrics,
|
|
2212
|
+
}),
|
|
2213
|
+
{
|
|
2214
|
+
name: 'agent_turn',
|
|
2215
|
+
context: this.agentSession.rootSpanContext,
|
|
2216
|
+
},
|
|
2217
|
+
);
|
|
1463
2218
|
|
|
1464
2219
|
private async realtimeGenerationTask(
|
|
1465
2220
|
speechHandle: SpeechHandle,
|
|
@@ -1467,6 +2222,44 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1467
2222
|
modelSettings: ModelSettings,
|
|
1468
2223
|
replyAbortController: AbortController,
|
|
1469
2224
|
): Promise<void> {
|
|
2225
|
+
return tracer.startActiveSpan(
|
|
2226
|
+
async (span) =>
|
|
2227
|
+
this._realtimeGenerationTaskImpl({
|
|
2228
|
+
speechHandle,
|
|
2229
|
+
ev,
|
|
2230
|
+
modelSettings,
|
|
2231
|
+
replyAbortController,
|
|
2232
|
+
span,
|
|
2233
|
+
}),
|
|
2234
|
+
{
|
|
2235
|
+
name: 'agent_turn',
|
|
2236
|
+
context: this.agentSession.rootSpanContext,
|
|
2237
|
+
},
|
|
2238
|
+
);
|
|
2239
|
+
}
|
|
2240
|
+
|
|
2241
|
+
private async _realtimeGenerationTaskImpl({
|
|
2242
|
+
speechHandle,
|
|
2243
|
+
ev,
|
|
2244
|
+
modelSettings,
|
|
2245
|
+
replyAbortController,
|
|
2246
|
+
span,
|
|
2247
|
+
}: {
|
|
2248
|
+
speechHandle: SpeechHandle;
|
|
2249
|
+
ev: GenerationCreatedEvent;
|
|
2250
|
+
modelSettings: ModelSettings;
|
|
2251
|
+
replyAbortController: AbortController;
|
|
2252
|
+
span: Span;
|
|
2253
|
+
}): Promise<void> {
|
|
2254
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
2255
|
+
|
|
2256
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
2257
|
+
|
|
2258
|
+
const localParticipant = this.agentSession._roomIO?.localParticipant;
|
|
2259
|
+
if (localParticipant) {
|
|
2260
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
2261
|
+
}
|
|
2262
|
+
|
|
1470
2263
|
speechHandleStorage.enterWith(speechHandle);
|
|
1471
2264
|
|
|
1472
2265
|
if (!this.realtimeSession) {
|
|
@@ -1476,6 +2269,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1476
2269
|
throw new Error('llm is not a realtime model');
|
|
1477
2270
|
}
|
|
1478
2271
|
|
|
2272
|
+
// Store span for metrics recording when they arrive later
|
|
2273
|
+
span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
|
|
2274
|
+
if (this.realtimeSpans && ev.responseId) {
|
|
2275
|
+
this.realtimeSpans.set(ev.responseId, span);
|
|
2276
|
+
}
|
|
2277
|
+
|
|
1479
2278
|
this.logger.debug(
|
|
1480
2279
|
{ speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
|
|
1481
2280
|
'realtime generation started',
|
|
@@ -1496,14 +2295,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1496
2295
|
return;
|
|
1497
2296
|
}
|
|
1498
2297
|
|
|
1499
|
-
const onFirstFrame = () => {
|
|
1500
|
-
this.agentSession._updateAgentState('speaking'
|
|
2298
|
+
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
2299
|
+
this.agentSession._updateAgentState('speaking', {
|
|
2300
|
+
startTime: startedSpeakingAt,
|
|
2301
|
+
otelContext: speechHandle._agentTurnContext,
|
|
2302
|
+
});
|
|
1501
2303
|
};
|
|
1502
2304
|
|
|
1503
2305
|
const readMessages = async (
|
|
1504
2306
|
abortController: AbortController,
|
|
1505
|
-
outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
|
|
2307
|
+
outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
|
|
1506
2308
|
) => {
|
|
2309
|
+
replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
|
|
2310
|
+
once: true,
|
|
2311
|
+
});
|
|
2312
|
+
|
|
1507
2313
|
const forwardTasks: Array<Task<void>> = [];
|
|
1508
2314
|
try {
|
|
1509
2315
|
for await (const msg of ev.messageStream) {
|
|
@@ -1513,7 +2319,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1513
2319
|
);
|
|
1514
2320
|
break;
|
|
1515
2321
|
}
|
|
1516
|
-
|
|
2322
|
+
|
|
2323
|
+
const msgModalities = msg.modalities ? await msg.modalities : undefined;
|
|
2324
|
+
let ttsTextInput: ReadableStream<string | TimedString> | null = null;
|
|
2325
|
+
let trTextInput: ReadableStream<string | TimedString>;
|
|
2326
|
+
|
|
2327
|
+
if (msgModalities && !msgModalities.includes('audio') && this.tts) {
|
|
2328
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
2329
|
+
this.logger.warn(
|
|
2330
|
+
'text response received from realtime API, falling back to use a TTS model.',
|
|
2331
|
+
);
|
|
2332
|
+
}
|
|
2333
|
+
const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
|
|
2334
|
+
ttsTextInput = _ttsTextInput;
|
|
2335
|
+
trTextInput = _trTextInput;
|
|
2336
|
+
} else {
|
|
2337
|
+
trTextInput = msg.textStream;
|
|
2338
|
+
}
|
|
2339
|
+
|
|
2340
|
+
const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
|
|
1517
2341
|
let textOut: _TextOut | null = null;
|
|
1518
2342
|
if (trNodeResult) {
|
|
1519
2343
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -1524,30 +2348,57 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1524
2348
|
forwardTasks.push(textForwardTask);
|
|
1525
2349
|
textOut = _textOut;
|
|
1526
2350
|
}
|
|
2351
|
+
|
|
1527
2352
|
let audioOut: _AudioOut | null = null;
|
|
1528
2353
|
if (audioOutput) {
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
audioOutput,
|
|
2354
|
+
let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
|
|
2355
|
+
|
|
2356
|
+
if (ttsTextInput) {
|
|
2357
|
+
const [ttsTask, ttsGenData] = performTTSInference(
|
|
2358
|
+
(...args) => this.agent.ttsNode(...args),
|
|
2359
|
+
ttsTextInput,
|
|
2360
|
+
modelSettings,
|
|
1537
2361
|
abortController,
|
|
2362
|
+
this.tts?.model,
|
|
2363
|
+
this.tts?.provider,
|
|
2364
|
+
);
|
|
2365
|
+
tasks.push(ttsTask);
|
|
2366
|
+
realtimeAudioResult = ttsGenData.audioStream;
|
|
2367
|
+
} else if (msgModalities && msgModalities.includes('audio')) {
|
|
2368
|
+
realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
|
|
2369
|
+
msg.audioStream,
|
|
2370
|
+
modelSettings,
|
|
2371
|
+
);
|
|
2372
|
+
} else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
2373
|
+
this.logger.error(
|
|
2374
|
+
'Text message received from Realtime API with audio modality. ' +
|
|
2375
|
+
'This usually happens when text chat context is synced to the API. ' +
|
|
2376
|
+
'Try to add a TTS model as fallback or use text modality with TTS instead.',
|
|
1538
2377
|
);
|
|
1539
|
-
forwardTasks.push(forwardTask);
|
|
1540
|
-
audioOut = _audioOut;
|
|
1541
|
-
audioOut.firstFrameFut.await.finally(onFirstFrame);
|
|
1542
2378
|
} else {
|
|
1543
2379
|
this.logger.warn(
|
|
1544
2380
|
'audio output is enabled but neither tts nor realtime audio is available',
|
|
1545
2381
|
);
|
|
1546
2382
|
}
|
|
2383
|
+
|
|
2384
|
+
if (realtimeAudioResult) {
|
|
2385
|
+
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
2386
|
+
realtimeAudioResult,
|
|
2387
|
+
audioOutput,
|
|
2388
|
+
abortController,
|
|
2389
|
+
);
|
|
2390
|
+
forwardTasks.push(forwardTask);
|
|
2391
|
+
audioOut = _audioOut;
|
|
2392
|
+
audioOut.firstFrameFut.await
|
|
2393
|
+
.then((ts) => onFirstFrame(ts))
|
|
2394
|
+
.catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
|
|
2395
|
+
}
|
|
1547
2396
|
} else if (textOut) {
|
|
1548
|
-
textOut.firstTextFut.await
|
|
2397
|
+
textOut.firstTextFut.await
|
|
2398
|
+
.then(() => onFirstFrame())
|
|
2399
|
+
.catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
|
|
1549
2400
|
}
|
|
1550
|
-
outputs.push([msg.messageId, textOut, audioOut]);
|
|
2401
|
+
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1551
2402
|
}
|
|
1552
2403
|
await waitFor(forwardTasks);
|
|
1553
2404
|
} catch (error) {
|
|
@@ -1557,11 +2408,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1557
2408
|
}
|
|
1558
2409
|
};
|
|
1559
2410
|
|
|
1560
|
-
const messageOutputs: Array<
|
|
2411
|
+
const messageOutputs: Array<
|
|
2412
|
+
[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
|
|
2413
|
+
> = [];
|
|
1561
2414
|
const tasks = [
|
|
1562
2415
|
Task.from(
|
|
1563
2416
|
(controller) => readMessages(controller, messageOutputs),
|
|
1564
|
-
|
|
2417
|
+
undefined,
|
|
1565
2418
|
'AgentActivity.realtime_generation.read_messages',
|
|
1566
2419
|
),
|
|
1567
2420
|
];
|
|
@@ -1598,6 +2451,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1598
2451
|
|
|
1599
2452
|
const onToolExecutionStarted = (f: FunctionCall) => {
|
|
1600
2453
|
speechHandle._itemAdded([f]);
|
|
2454
|
+
this.agent._chatCtx.items.push(f);
|
|
2455
|
+
this.agentSession._toolItemsAdded([f]);
|
|
1601
2456
|
};
|
|
1602
2457
|
|
|
1603
2458
|
const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
|
|
@@ -1623,7 +2478,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1623
2478
|
|
|
1624
2479
|
if (audioOutput) {
|
|
1625
2480
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1626
|
-
this.agentSession._updateAgentState('listening');
|
|
1627
2481
|
}
|
|
1628
2482
|
|
|
1629
2483
|
if (speechHandle.interrupted) {
|
|
@@ -1636,17 +2490,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1636
2490
|
|
|
1637
2491
|
if (messageOutputs.length > 0) {
|
|
1638
2492
|
// there should be only one message
|
|
1639
|
-
const [msgId, textOut, audioOut] = messageOutputs[0]!;
|
|
2493
|
+
const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
|
|
1640
2494
|
let forwardedText = textOut?.text || '';
|
|
1641
2495
|
|
|
1642
2496
|
if (audioOutput) {
|
|
1643
2497
|
audioOutput.clearBuffer();
|
|
1644
2498
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1645
|
-
let
|
|
1646
|
-
if (audioOut?.firstFrameFut.done) {
|
|
2499
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
2500
|
+
if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
|
|
1647
2501
|
// playback EV is valid only if the first frame was already played
|
|
1648
2502
|
this.logger.info(
|
|
1649
|
-
{ speech_id: speechHandle.id,
|
|
2503
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1650
2504
|
'playout interrupted',
|
|
1651
2505
|
);
|
|
1652
2506
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1654,13 +2508,15 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1654
2508
|
}
|
|
1655
2509
|
} else {
|
|
1656
2510
|
forwardedText = '';
|
|
1657
|
-
|
|
2511
|
+
playbackPositionInS = 0;
|
|
1658
2512
|
}
|
|
1659
2513
|
|
|
1660
2514
|
// truncate server-side message
|
|
1661
2515
|
this.realtimeSession.truncate({
|
|
1662
2516
|
messageId: msgId,
|
|
1663
|
-
audioEndMs: Math.floor(
|
|
2517
|
+
audioEndMs: Math.floor(playbackPositionInS * 1000),
|
|
2518
|
+
modalities: msgModalities,
|
|
2519
|
+
audioTranscript: forwardedText,
|
|
1664
2520
|
});
|
|
1665
2521
|
}
|
|
1666
2522
|
|
|
@@ -1691,7 +2547,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1691
2547
|
|
|
1692
2548
|
if (messageOutputs.length > 0) {
|
|
1693
2549
|
// there should be only one message
|
|
1694
|
-
const [msgId, textOut, _] = messageOutputs[0]!;
|
|
2550
|
+
const [msgId, textOut, _, __] = messageOutputs[0]!;
|
|
1695
2551
|
const message = ChatMessage.create({
|
|
1696
2552
|
role: 'assistant',
|
|
1697
2553
|
content: textOut?.text || '',
|
|
@@ -1708,16 +2564,20 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1708
2564
|
speechHandle._markGenerationDone();
|
|
1709
2565
|
// TODO(brian): close tees
|
|
1710
2566
|
|
|
1711
|
-
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
1712
|
-
this.agentSession._updateAgentState('thinking');
|
|
1713
|
-
});
|
|
1714
|
-
|
|
1715
2567
|
await executeToolsTask.result;
|
|
1716
2568
|
|
|
1717
|
-
if (toolOutput.output.length
|
|
2569
|
+
if (toolOutput.output.length > 0) {
|
|
2570
|
+
this.agentSession._updateAgentState('thinking');
|
|
2571
|
+
} else if (this.agentSession.agentState === 'speaking') {
|
|
2572
|
+
this.agentSession._updateAgentState('listening');
|
|
2573
|
+
}
|
|
2574
|
+
|
|
2575
|
+
if (toolOutput.output.length === 0) {
|
|
2576
|
+
return;
|
|
2577
|
+
}
|
|
1718
2578
|
|
|
1719
2579
|
// important: no agent ouput should be used after this point
|
|
1720
|
-
const { maxToolSteps } = this.agentSession.
|
|
2580
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
1721
2581
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1722
2582
|
this.logger.warn(
|
|
1723
2583
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1726,55 +2586,42 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1726
2586
|
return;
|
|
1727
2587
|
}
|
|
1728
2588
|
|
|
1729
|
-
const functionToolsExecutedEvent =
|
|
1730
|
-
|
|
1731
|
-
functionCallOutputs: [],
|
|
1732
|
-
});
|
|
1733
|
-
let shouldGenerateToolReply: boolean = false;
|
|
1734
|
-
let newAgentTask: Agent | null = null;
|
|
1735
|
-
let ignoreTaskSwitch: boolean = false;
|
|
1736
|
-
|
|
1737
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1738
|
-
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
1739
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1740
|
-
if (sanitizedOut.replyRequired) {
|
|
1741
|
-
shouldGenerateToolReply = true;
|
|
1742
|
-
}
|
|
1743
|
-
}
|
|
1744
|
-
|
|
1745
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
1746
|
-
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
1747
|
-
ignoreTaskSwitch = true;
|
|
1748
|
-
}
|
|
1749
|
-
|
|
1750
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1751
|
-
|
|
1752
|
-
this.logger.debug(
|
|
1753
|
-
{
|
|
1754
|
-
speechId: speechHandle.id,
|
|
1755
|
-
name: sanitizedOut.toolCall?.name,
|
|
1756
|
-
args: sanitizedOut.toolCall.args,
|
|
1757
|
-
output: sanitizedOut.toolCallOutput?.output,
|
|
1758
|
-
isError: sanitizedOut.toolCallOutput?.isError,
|
|
1759
|
-
},
|
|
1760
|
-
'Tool call execution finished',
|
|
1761
|
-
);
|
|
1762
|
-
}
|
|
2589
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
|
|
2590
|
+
this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1763
2591
|
|
|
1764
2592
|
this.agentSession.emit(
|
|
1765
2593
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1766
2594
|
functionToolsExecutedEvent,
|
|
1767
2595
|
);
|
|
1768
2596
|
|
|
1769
|
-
let
|
|
2597
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1770
2598
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1771
2599
|
this.agentSession.updateAgent(newAgentTask);
|
|
1772
|
-
|
|
2600
|
+
schedulingPaused = true;
|
|
1773
2601
|
}
|
|
1774
2602
|
|
|
1775
2603
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
2604
|
+
// wait all speeches played before updating the tool output and generating the response
|
|
2605
|
+
// most realtime models dont support generating multiple responses at the same time
|
|
2606
|
+
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
2607
|
+
if (
|
|
2608
|
+
this.currentSpeech &&
|
|
2609
|
+
!this.currentSpeech.done() &&
|
|
2610
|
+
this.currentSpeech !== speechHandle
|
|
2611
|
+
) {
|
|
2612
|
+
await this.currentSpeech.waitForPlayout();
|
|
2613
|
+
} else {
|
|
2614
|
+
// Don't block the event loop
|
|
2615
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
2616
|
+
}
|
|
2617
|
+
}
|
|
1776
2618
|
const chatCtx = this.realtimeSession.chatCtx.copy();
|
|
1777
2619
|
chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
|
|
2620
|
+
|
|
2621
|
+
this.agentSession._toolItemsAdded(
|
|
2622
|
+
functionToolsExecutedEvent.functionCallOutputs as FunctionCallOutput[],
|
|
2623
|
+
);
|
|
2624
|
+
|
|
1778
2625
|
try {
|
|
1779
2626
|
await this.realtimeSession.updateChatCtx(chatCtx);
|
|
1780
2627
|
} catch (error) {
|
|
@@ -1806,15 +2653,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1806
2653
|
}),
|
|
1807
2654
|
);
|
|
1808
2655
|
|
|
1809
|
-
const toolChoice =
|
|
2656
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1810
2657
|
this.createSpeechTask({
|
|
1811
|
-
|
|
2658
|
+
taskFn: (abortController: AbortController) =>
|
|
1812
2659
|
this.realtimeReplyTask({
|
|
1813
2660
|
speechHandle: replySpeechHandle,
|
|
1814
2661
|
modelSettings: { toolChoice },
|
|
1815
2662
|
abortController,
|
|
1816
2663
|
}),
|
|
1817
|
-
),
|
|
1818
2664
|
ownedSpeechHandle: replySpeechHandle,
|
|
1819
2665
|
name: 'AgentActivity.realtime_reply',
|
|
1820
2666
|
});
|
|
@@ -1822,6 +2668,53 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1822
2668
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1823
2669
|
}
|
|
1824
2670
|
|
|
2671
|
+
private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
|
|
2672
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
2673
|
+
functionCalls: [],
|
|
2674
|
+
functionCallOutputs: [],
|
|
2675
|
+
});
|
|
2676
|
+
|
|
2677
|
+
let shouldGenerateToolReply = false;
|
|
2678
|
+
let newAgentTask: Agent | null = null;
|
|
2679
|
+
let ignoreTaskSwitch = false;
|
|
2680
|
+
|
|
2681
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
2682
|
+
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
2683
|
+
// Keep event payload symmetric for pipeline + realtime paths.
|
|
2684
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
2685
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
2686
|
+
if (sanitizedOut.replyRequired) {
|
|
2687
|
+
shouldGenerateToolReply = true;
|
|
2688
|
+
}
|
|
2689
|
+
}
|
|
2690
|
+
|
|
2691
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
2692
|
+
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
2693
|
+
ignoreTaskSwitch = true;
|
|
2694
|
+
}
|
|
2695
|
+
|
|
2696
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
2697
|
+
|
|
2698
|
+
this.logger.debug(
|
|
2699
|
+
{
|
|
2700
|
+
speechId: speechHandle.id,
|
|
2701
|
+
name: sanitizedOut.toolCall?.name,
|
|
2702
|
+
args: sanitizedOut.toolCall.args,
|
|
2703
|
+
output: sanitizedOut.toolCallOutput?.output,
|
|
2704
|
+
isError: sanitizedOut.toolCallOutput?.isError,
|
|
2705
|
+
},
|
|
2706
|
+
'Tool call execution finished',
|
|
2707
|
+
);
|
|
2708
|
+
}
|
|
2709
|
+
|
|
2710
|
+
return {
|
|
2711
|
+
functionToolsExecutedEvent,
|
|
2712
|
+
shouldGenerateToolReply,
|
|
2713
|
+
newAgentTask,
|
|
2714
|
+
ignoreTaskSwitch,
|
|
2715
|
+
};
|
|
2716
|
+
}
|
|
2717
|
+
|
|
1825
2718
|
private async realtimeReplyTask({
|
|
1826
2719
|
speechHandle,
|
|
1827
2720
|
modelSettings: { toolChoice },
|
|
@@ -1880,10 +2773,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1880
2773
|
priority: number,
|
|
1881
2774
|
force: boolean = false,
|
|
1882
2775
|
): void {
|
|
1883
|
-
// when force=true, we allow tool responses to bypass
|
|
2776
|
+
// when force=true, we allow tool responses to bypass scheduling pause
|
|
1884
2777
|
// This allows for tool responses to be generated before the AgentActivity is finalized
|
|
1885
|
-
if (this.
|
|
1886
|
-
throw new Error('cannot schedule new speech, the
|
|
2778
|
+
if (this.schedulingPaused && !force) {
|
|
2779
|
+
throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
|
|
1887
2780
|
}
|
|
1888
2781
|
|
|
1889
2782
|
// Monotonic time to avoid near 0 collisions
|
|
@@ -1892,19 +2785,77 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1892
2785
|
this.wakeupMainTask();
|
|
1893
2786
|
}
|
|
1894
2787
|
|
|
2788
|
+
private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
|
|
2789
|
+
if (this._schedulingPaused) return;
|
|
2790
|
+
|
|
2791
|
+
this._schedulingPaused = true;
|
|
2792
|
+
this._drainBlockedTasks = blockedTasks;
|
|
2793
|
+
this.wakeupMainTask();
|
|
2794
|
+
|
|
2795
|
+
if (this._mainTask) {
|
|
2796
|
+
// When pausing/draining, we ensure that all speech_tasks complete fully.
|
|
2797
|
+
// This means that even if the SpeechHandle themselves have finished,
|
|
2798
|
+
// we still wait for the entire execution (e.g function_tools)
|
|
2799
|
+
await this._mainTask.result;
|
|
2800
|
+
}
|
|
2801
|
+
}
|
|
2802
|
+
|
|
2803
|
+
private _resumeSchedulingTask(): void {
|
|
2804
|
+
if (!this._schedulingPaused) return;
|
|
2805
|
+
|
|
2806
|
+
this._schedulingPaused = false;
|
|
2807
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
2808
|
+
}
|
|
2809
|
+
|
|
2810
|
+
async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
|
|
2811
|
+
const { blockedTasks = [] } = options;
|
|
2812
|
+
const unlock = await this.lock.lock();
|
|
2813
|
+
|
|
2814
|
+
try {
|
|
2815
|
+
const span = tracer.startSpan({
|
|
2816
|
+
name: 'pause_agent_activity',
|
|
2817
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
2818
|
+
});
|
|
2819
|
+
try {
|
|
2820
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
2821
|
+
await this._closeSessionResources();
|
|
2822
|
+
} finally {
|
|
2823
|
+
span.end();
|
|
2824
|
+
}
|
|
2825
|
+
} finally {
|
|
2826
|
+
unlock();
|
|
2827
|
+
}
|
|
2828
|
+
}
|
|
2829
|
+
|
|
1895
2830
|
async drain(): Promise<void> {
|
|
2831
|
+
// Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
|
|
2832
|
+
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
2833
|
+
name: 'drain_agent_activity',
|
|
2834
|
+
context: ROOT_CONTEXT,
|
|
2835
|
+
});
|
|
2836
|
+
}
|
|
2837
|
+
|
|
2838
|
+
private async _drainImpl(span: Span): Promise<void> {
|
|
2839
|
+
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
2840
|
+
|
|
1896
2841
|
const unlock = await this.lock.lock();
|
|
1897
2842
|
try {
|
|
1898
|
-
if (this.
|
|
2843
|
+
if (this._schedulingPaused) return;
|
|
1899
2844
|
|
|
1900
|
-
this.createSpeechTask({
|
|
1901
|
-
|
|
2845
|
+
this._onExitTask = this.createSpeechTask({
|
|
2846
|
+
taskFn: () =>
|
|
2847
|
+
tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
2848
|
+
name: 'on_exit',
|
|
2849
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
2850
|
+
}),
|
|
2851
|
+
inlineTask: true,
|
|
1902
2852
|
name: 'AgentActivity_onExit',
|
|
1903
2853
|
});
|
|
1904
2854
|
|
|
1905
|
-
this.
|
|
1906
|
-
|
|
1907
|
-
await this.
|
|
2855
|
+
this.cancelPreemptiveGeneration();
|
|
2856
|
+
|
|
2857
|
+
await this._onExitTask.result;
|
|
2858
|
+
await this._pauseSchedulingTask([]);
|
|
1908
2859
|
} finally {
|
|
1909
2860
|
unlock();
|
|
1910
2861
|
}
|
|
@@ -1913,42 +2864,160 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1913
2864
|
async close(): Promise<void> {
|
|
1914
2865
|
const unlock = await this.lock.lock();
|
|
1915
2866
|
try {
|
|
1916
|
-
|
|
1917
|
-
this.logger.warn('task closing without draining');
|
|
1918
|
-
}
|
|
2867
|
+
this.cancelPreemptiveGeneration();
|
|
1919
2868
|
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
if (this.realtimeSession) {
|
|
1925
|
-
this.realtimeSession.off('generation_created', this.onGenerationCreated);
|
|
1926
|
-
this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
|
|
1927
|
-
this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
|
|
1928
|
-
this.realtimeSession.off(
|
|
1929
|
-
'input_audio_transcription_completed',
|
|
1930
|
-
this.onInputAudioTranscriptionCompleted,
|
|
1931
|
-
);
|
|
1932
|
-
this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
|
|
1933
|
-
}
|
|
1934
|
-
if (this.stt instanceof STT) {
|
|
1935
|
-
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
2869
|
+
await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
2870
|
+
|
|
2871
|
+
if (this._currentSpeech && !this._currentSpeech.done()) {
|
|
2872
|
+
this._currentSpeech._markDone();
|
|
1936
2873
|
}
|
|
1937
|
-
|
|
1938
|
-
|
|
2874
|
+
|
|
2875
|
+
await this._closeSessionResources();
|
|
2876
|
+
|
|
2877
|
+
if (this._mainTask) {
|
|
2878
|
+
await this._mainTask.cancelAndWait();
|
|
1939
2879
|
}
|
|
1940
|
-
if (this.
|
|
1941
|
-
this.
|
|
2880
|
+
if (this.interruptionDetector) {
|
|
2881
|
+
this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2882
|
+
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2883
|
+
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
1942
2884
|
}
|
|
1943
2885
|
|
|
1944
|
-
this.
|
|
1945
|
-
await this.realtimeSession?.close();
|
|
1946
|
-
await this.audioRecognition?.close();
|
|
1947
|
-
await this._mainTask?.cancelAndWait();
|
|
2886
|
+
this.agent._agentActivity = undefined;
|
|
1948
2887
|
} finally {
|
|
1949
2888
|
unlock();
|
|
1950
2889
|
}
|
|
1951
2890
|
}
|
|
2891
|
+
|
|
2892
|
+
private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
|
|
2893
|
+
const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
|
|
2894
|
+
const sessionInterruptionDetection = this.agentSession.interruptionDetection;
|
|
2895
|
+
if (
|
|
2896
|
+
!(
|
|
2897
|
+
this.stt &&
|
|
2898
|
+
this.stt.capabilities.alignedTranscript &&
|
|
2899
|
+
this.stt.capabilities.streaming &&
|
|
2900
|
+
this.vad &&
|
|
2901
|
+
this.turnDetection !== 'manual' &&
|
|
2902
|
+
this.turnDetection !== 'realtime_llm' &&
|
|
2903
|
+
!(this.llm instanceof RealtimeModel)
|
|
2904
|
+
)
|
|
2905
|
+
) {
|
|
2906
|
+
if (
|
|
2907
|
+
agentInterruptionDetection === 'adaptive' ||
|
|
2908
|
+
sessionInterruptionDetection === 'adaptive'
|
|
2909
|
+
) {
|
|
2910
|
+
this.logger.warn(
|
|
2911
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
|
|
2912
|
+
);
|
|
2913
|
+
}
|
|
2914
|
+
return undefined;
|
|
2915
|
+
}
|
|
2916
|
+
|
|
2917
|
+
if (!this.allowInterruptions) {
|
|
2918
|
+
return undefined;
|
|
2919
|
+
}
|
|
2920
|
+
|
|
2921
|
+
if (agentInterruptionDetection === 'vad') {
|
|
2922
|
+
return undefined;
|
|
2923
|
+
}
|
|
2924
|
+
|
|
2925
|
+
if (sessionInterruptionDetection === 'vad') {
|
|
2926
|
+
return undefined;
|
|
2927
|
+
}
|
|
2928
|
+
|
|
2929
|
+
if (
|
|
2930
|
+
agentInterruptionDetection === undefined &&
|
|
2931
|
+
sessionInterruptionDetection === undefined &&
|
|
2932
|
+
!isHosted() &&
|
|
2933
|
+
!isDevMode()
|
|
2934
|
+
) {
|
|
2935
|
+
this.logger.info('adaptive interruption is disabled by default in production mode');
|
|
2936
|
+
return undefined;
|
|
2937
|
+
}
|
|
2938
|
+
|
|
2939
|
+
try {
|
|
2940
|
+
const detector = new AdaptiveInterruptionDetector();
|
|
2941
|
+
|
|
2942
|
+
detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2943
|
+
detector.on('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2944
|
+
detector.on('error', this.onInterruptionError);
|
|
2945
|
+
|
|
2946
|
+
return detector;
|
|
2947
|
+
} catch (error: unknown) {
|
|
2948
|
+
this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
|
|
2949
|
+
}
|
|
2950
|
+
return undefined;
|
|
2951
|
+
}
|
|
2952
|
+
|
|
2953
|
+
private restoreInterruptionByAudioActivity(): void {
|
|
2954
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2955
|
+
}
|
|
2956
|
+
|
|
2957
|
+
private fallbackToVadInterruption(): void {
|
|
2958
|
+
if (!this.isInterruptionDetectionEnabled) return;
|
|
2959
|
+
|
|
2960
|
+
this.isInterruptionDetectionEnabled = false;
|
|
2961
|
+
this.restoreInterruptionByAudioActivity();
|
|
2962
|
+
|
|
2963
|
+
if (this.interruptionDetector) {
|
|
2964
|
+
this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2965
|
+
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2966
|
+
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
2967
|
+
this.interruptionDetector = undefined;
|
|
2968
|
+
}
|
|
2969
|
+
|
|
2970
|
+
if (this.audioRecognition) {
|
|
2971
|
+
this.audioRecognition.disableInterruptionDetection().catch((err) => {
|
|
2972
|
+
this.logger.warn({ err }, 'error while disabling interruption detection');
|
|
2973
|
+
});
|
|
2974
|
+
}
|
|
2975
|
+
|
|
2976
|
+
this.logger.warn(
|
|
2977
|
+
'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption',
|
|
2978
|
+
);
|
|
2979
|
+
}
|
|
2980
|
+
|
|
2981
|
+
private async _closeSessionResources(): Promise<void> {
|
|
2982
|
+
// Unregister event handlers to prevent duplicate metrics
|
|
2983
|
+
if (this.llm instanceof LLM) {
|
|
2984
|
+
this.llm.off('metrics_collected', this.onMetricsCollected);
|
|
2985
|
+
this.llm.off('error', this.onModelError);
|
|
2986
|
+
}
|
|
2987
|
+
|
|
2988
|
+
if (this.realtimeSession) {
|
|
2989
|
+
this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
|
|
2990
|
+
this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
|
|
2991
|
+
this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
|
|
2992
|
+
this.realtimeSession.off(
|
|
2993
|
+
'input_audio_transcription_completed',
|
|
2994
|
+
this.onRealtimeInputAudioTranscriptionCompleted,
|
|
2995
|
+
);
|
|
2996
|
+
this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
|
|
2997
|
+
this.realtimeSession.off('error', this.onModelError);
|
|
2998
|
+
}
|
|
2999
|
+
|
|
3000
|
+
if (this.stt instanceof STT) {
|
|
3001
|
+
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
3002
|
+
this.stt.off('error', this.onModelError);
|
|
3003
|
+
}
|
|
3004
|
+
|
|
3005
|
+
if (this.tts instanceof TTS) {
|
|
3006
|
+
this.tts.off('metrics_collected', this.onMetricsCollected);
|
|
3007
|
+
this.tts.off('error', this.onModelError);
|
|
3008
|
+
}
|
|
3009
|
+
|
|
3010
|
+
if (this.vad instanceof VAD) {
|
|
3011
|
+
this.vad.off('metrics_collected', this.onMetricsCollected);
|
|
3012
|
+
}
|
|
3013
|
+
|
|
3014
|
+
this.detachAudioInput();
|
|
3015
|
+
this.realtimeSpans?.clear();
|
|
3016
|
+
await this.realtimeSession?.close();
|
|
3017
|
+
await this.audioRecognition?.close();
|
|
3018
|
+
this.realtimeSession = undefined;
|
|
3019
|
+
this.audioRecognition = undefined;
|
|
3020
|
+
}
|
|
1952
3021
|
}
|
|
1953
3022
|
|
|
1954
3023
|
function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {
|