@livekit/agents 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_exceptions.cjs.map +1 -1
- package/dist/_exceptions.d.ts.map +1 -1
- package/dist/_exceptions.js.map +1 -1
- package/dist/audio.cjs +89 -3
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +36 -1
- package/dist/audio.d.ts +36 -1
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +76 -2
- package/dist/audio.js.map +1 -1
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +165 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +141 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +44 -46
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +45 -47
- package/dist/cli.js.map +1 -1
- package/dist/connection_pool.cjs +242 -0
- package/dist/connection_pool.cjs.map +1 -0
- package/dist/connection_pool.d.cts +123 -0
- package/dist/connection_pool.d.ts +123 -0
- package/dist/connection_pool.d.ts.map +1 -0
- package/dist/connection_pool.js +218 -0
- package/dist/connection_pool.js.map +1 -0
- package/dist/connection_pool.test.cjs +256 -0
- package/dist/connection_pool.test.cjs.map +1 -0
- package/dist/connection_pool.test.js +255 -0
- package/dist/connection_pool.test.js.map +1 -0
- package/dist/constants.cjs +30 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +10 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +20 -0
- package/dist/constants.js.map +1 -1
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/http_server.cjs +9 -6
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +5 -1
- package/dist/http_server.d.ts +5 -1
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js +9 -6
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +24 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -11
- package/dist/index.d.ts +15 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -9
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.cjs +70 -2
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +373 -32
- package/dist/inference/api_protos.d.ts +373 -32
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +62 -2
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +3 -4
- package/dist/inference/index.d.ts +3 -4
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +18 -3
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +163 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +65 -0
- package/dist/inference/interruption/http_transport.d.ts +65 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +137 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +198 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +164 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +347 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +313 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +106 -66
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +65 -43
- package/dist/inference/llm.d.ts +65 -43
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +100 -66
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +319 -170
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +64 -15
- package/dist/inference/stt.d.ts +64 -15
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +319 -170
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +218 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +217 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +249 -71
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +94 -17
- package/dist/inference/tts.d.ts +94 -17
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +249 -77
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +305 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +304 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +14 -1
- package/dist/inference/utils.d.ts +14 -1
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs +6 -3
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
- package/dist/ipc/inference_proc_executor.js +6 -3
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +6 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.ts.map +1 -1
- package/dist/ipc/job_proc_executor.js +6 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +89 -17
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +68 -18
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs +34 -8
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +8 -0
- package/dist/ipc/supervised_proc.d.ts +8 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js +34 -8
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/ipc/supervised_proc.test.cjs +145 -0
- package/dist/ipc/supervised_proc.test.cjs.map +1 -0
- package/dist/ipc/supervised_proc.test.js +122 -0
- package/dist/ipc/supervised_proc.test.js.map +1 -0
- package/dist/job.cjs +109 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +14 -0
- package/dist/job.d.ts +14 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +99 -1
- package/dist/job.js.map +1 -1
- package/dist/language.cjs +394 -0
- package/dist/language.cjs.map +1 -0
- package/dist/language.d.cts +15 -0
- package/dist/language.d.ts +15 -0
- package/dist/language.d.ts.map +1 -0
- package/dist/language.js +363 -0
- package/dist/language.js.map +1 -0
- package/dist/language.test.cjs +43 -0
- package/dist/language.test.cjs.map +1 -0
- package/dist/language.test.js +49 -0
- package/dist/language.test.js.map +1 -0
- package/dist/llm/chat_context.cjs +345 -3
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +86 -2
- package/dist/llm/chat_context.d.ts +86 -2
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +344 -3
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +692 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +692 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/fallback_adapter.cjs +280 -0
- package/dist/llm/fallback_adapter.cjs.map +1 -0
- package/dist/llm/fallback_adapter.d.cts +73 -0
- package/dist/llm/fallback_adapter.d.ts +73 -0
- package/dist/llm/fallback_adapter.d.ts.map +1 -0
- package/dist/llm/fallback_adapter.js +256 -0
- package/dist/llm/fallback_adapter.js.map +1 -0
- package/dist/llm/fallback_adapter.test.cjs +176 -0
- package/dist/llm/fallback_adapter.test.cjs.map +1 -0
- package/dist/llm/fallback_adapter.test.js +175 -0
- package/dist/llm/fallback_adapter.test.js.map +1 -0
- package/dist/llm/index.cjs +11 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +4 -3
- package/dist/llm/index.d.ts +4 -3
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +13 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +65 -11
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +13 -2
- package/dist/llm/llm.d.ts +13 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +65 -11
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs +6 -2
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js +6 -2
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/google.test.cjs +48 -0
- package/dist/llm/provider_format/google.test.cjs.map +1 -1
- package/dist/llm/provider_format/google.test.js +54 -1
- package/dist/llm/provider_format/google.test.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +126 -24
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +124 -23
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +393 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +400 -2
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +5 -4
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +5 -4
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +15 -1
- package/dist/llm/realtime.d.ts +15 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/remote_chat_context.cjs.map +1 -1
- package/dist/llm/remote_chat_context.d.cts +2 -0
- package/dist/llm/remote_chat_context.d.ts +2 -0
- package/dist/llm/remote_chat_context.d.ts.map +1 -1
- package/dist/llm/remote_chat_context.js.map +1 -1
- package/dist/llm/tool_context.cjs +50 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +47 -11
- package/dist/llm/tool_context.d.ts +47 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +48 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +107 -12
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +10 -3
- package/dist/llm/utils.d.ts +10 -3
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +106 -12
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/utils.test.cjs +90 -0
- package/dist/llm/utils.test.cjs.map +1 -1
- package/dist/llm/utils.test.js +98 -2
- package/dist/llm/utils.test.js.map +1 -1
- package/dist/llm/zod-utils.cjs +102 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +64 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +472 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +455 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/log.cjs +45 -14
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +8 -1
- package/dist/log.d.ts +8 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +45 -15
- package/dist/log.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +75 -19
- package/dist/metrics/base.d.ts +75 -19
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +5 -2
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +10 -1
- package/dist/metrics/usage_collector.d.ts +10 -1
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +5 -2
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +23 -7
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +23 -7
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +31 -10
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +31 -10
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +344 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +343 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stream/stream_channel.cjs +39 -1
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +5 -2
- package/dist/stream/stream_channel.d.ts +5 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +39 -1
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stream/stream_channel.test.cjs +27 -0
- package/dist/stream/stream_channel.test.cjs.map +1 -1
- package/dist/stream/stream_channel.test.js +27 -0
- package/dist/stream/stream_channel.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +24 -9
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +24 -9
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +94 -19
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +68 -5
- package/dist/stt/stt.d.ts +68 -5
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +96 -21
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/index.cjs +72 -0
- package/dist/telemetry/index.cjs.map +1 -0
- package/dist/telemetry/index.d.cts +7 -0
- package/dist/telemetry/index.d.ts +7 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +37 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/logging.cjs +65 -0
- package/dist/telemetry/logging.cjs.map +1 -0
- package/dist/telemetry/logging.d.cts +21 -0
- package/dist/telemetry/logging.d.ts +21 -0
- package/dist/telemetry/logging.d.ts.map +1 -0
- package/dist/telemetry/logging.js +40 -0
- package/dist/telemetry/logging.js.map +1 -0
- package/dist/telemetry/otel_http_exporter.cjs +166 -0
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
- package/dist/telemetry/otel_http_exporter.d.cts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
- package/dist/telemetry/otel_http_exporter.js +142 -0
- package/dist/telemetry/otel_http_exporter.js.map +1 -0
- package/dist/telemetry/pino_otel_transport.cjs +217 -0
- package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
- package/dist/telemetry/pino_otel_transport.d.cts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
- package/dist/telemetry/pino_otel_transport.js +189 -0
- package/dist/telemetry/pino_otel_transport.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +233 -0
- package/dist/telemetry/trace_types.cjs.map +1 -0
- package/dist/telemetry/trace_types.d.cts +74 -0
- package/dist/telemetry/trace_types.d.ts +74 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -0
- package/dist/telemetry/trace_types.js +141 -0
- package/dist/telemetry/trace_types.js.map +1 -0
- package/dist/telemetry/traces.cjs +484 -0
- package/dist/telemetry/traces.cjs.map +1 -0
- package/dist/telemetry/traces.d.cts +116 -0
- package/dist/telemetry/traces.d.ts +116 -0
- package/dist/telemetry/traces.d.ts.map +1 -0
- package/dist/telemetry/traces.js +449 -0
- package/dist/telemetry/traces.js.map +1 -0
- package/dist/telemetry/utils.cjs +86 -0
- package/dist/telemetry/utils.cjs.map +1 -0
- package/dist/telemetry/utils.d.cts +5 -0
- package/dist/telemetry/utils.d.ts +5 -0
- package/dist/telemetry/utils.d.ts.map +1 -0
- package/dist/telemetry/utils.js +51 -0
- package/dist/telemetry/utils.js.map +1 -0
- package/dist/tokenize/basic/sentence.cjs +3 -3
- package/dist/tokenize/basic/sentence.cjs.map +1 -1
- package/dist/tokenize/basic/sentence.js +3 -3
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +3 -1
- package/dist/tokenize/tokenizer.test.cjs.map +1 -1
- package/dist/tokenize/tokenizer.test.js +3 -1
- package/dist/tokenize/tokenizer.test.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +472 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +448 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +25 -8
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +25 -8
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +189 -57
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +58 -6
- package/dist/tts/tts.d.ts +58 -6
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +191 -59
- package/dist/tts/tts.js.map +1 -1
- package/dist/typed_promise.cjs +48 -0
- package/dist/typed_promise.cjs.map +1 -0
- package/dist/typed_promise.d.cts +24 -0
- package/dist/typed_promise.d.ts +24 -0
- package/dist/typed_promise.d.ts.map +1 -0
- package/dist/typed_promise.js +28 -0
- package/dist/typed_promise.js.map +1 -0
- package/dist/types.cjs +24 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +45 -10
- package/dist/types.d.ts +45 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +20 -30
- package/dist/types.js.map +1 -1
- package/dist/utils.cjs +124 -28
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +41 -1
- package/dist/utils.d.ts +41 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +119 -27
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +73 -1
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +74 -10
- package/dist/utils.test.js.map +1 -1
- package/dist/vad.cjs +35 -15
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +15 -5
- package/dist/vad.d.ts +15 -5
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +35 -15
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +258 -35
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +54 -13
- package/dist/voice/agent.d.ts +54 -13
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +254 -34
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +314 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +316 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1116 -385
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +72 -11
- package/dist/voice/agent_activity.d.ts +72 -11
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +1119 -383
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_activity.test.cjs +135 -0
- package/dist/voice/agent_activity.test.cjs.map +1 -0
- package/dist/voice/agent_activity.test.js +134 -0
- package/dist/voice/agent_activity.test.js.map +1 -0
- package/dist/voice/agent_session.cjs +550 -90
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +185 -25
- package/dist/voice/agent_session.d.ts +185 -25
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +556 -91
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +605 -46
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +96 -4
- package/dist/voice/audio_recognition.d.ts +96 -4
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +611 -47
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +295 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +299 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/avatar/datastream_io.cjs +7 -1
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +7 -1
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs +367 -0
- package/dist/voice/background_audio.cjs.map +1 -0
- package/dist/voice/background_audio.d.cts +123 -0
- package/dist/voice/background_audio.d.ts +123 -0
- package/dist/voice/background_audio.d.ts.map +1 -0
- package/dist/voice/background_audio.js +343 -0
- package/dist/voice/background_audio.js.map +1 -0
- package/dist/voice/events.cjs +3 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +16 -9
- package/dist/voice/events.d.ts +16 -9
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +3 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +205 -41
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +21 -5
- package/dist/voice/generation.d.ts +21 -5
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +215 -43
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/generation_tools.test.cjs +236 -0
- package/dist/voice/generation_tools.test.cjs.map +1 -0
- package/dist/voice/generation_tools.test.js +235 -0
- package/dist/voice/generation_tools.test.js.map +1 -0
- package/dist/voice/index.cjs +33 -2
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +8 -2
- package/dist/voice/index.d.ts +8 -2
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +19 -2
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/io.cjs +66 -6
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +67 -7
- package/dist/voice/io.d.ts +67 -7
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +62 -5
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +607 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +573 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/remote_session.cjs +922 -0
- package/dist/voice/remote_session.cjs.map +1 -0
- package/dist/voice/remote_session.d.cts +108 -0
- package/dist/voice/remote_session.d.ts +108 -0
- package/dist/voice/remote_session.d.ts.map +1 -0
- package/dist/voice/remote_session.js +887 -0
- package/dist/voice/remote_session.js.map +1 -0
- package/dist/voice/report.cjs +88 -0
- package/dist/voice/report.cjs.map +1 -0
- package/dist/voice/report.d.cts +49 -0
- package/dist/voice/report.d.ts +49 -0
- package/dist/voice/report.d.ts.map +1 -0
- package/dist/voice/report.js +63 -0
- package/dist/voice/report.js.map +1 -0
- package/dist/voice/report.test.cjs +121 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +120 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/_input.cjs +40 -7
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +5 -2
- package/dist/voice/room_io/_input.d.ts +5 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +41 -8
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +19 -11
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +7 -4
- package/dist/voice/room_io/_output.d.ts +7 -4
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +20 -12
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +33 -6
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +29 -9
- package/dist/voice/room_io/room_io.d.ts +29 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +33 -7
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +22 -4
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +17 -2
- package/dist/voice/speech_handle.d.ts +17 -2
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +21 -4
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +57 -0
- package/dist/voice/testing/index.cjs.map +1 -0
- package/dist/voice/testing/index.d.cts +21 -0
- package/dist/voice/testing/index.d.ts +21 -0
- package/dist/voice/testing/index.d.ts.map +1 -0
- package/dist/voice/testing/index.js +35 -0
- package/dist/voice/testing/index.js.map +1 -0
- package/dist/voice/testing/run_result.cjs +817 -0
- package/dist/voice/testing/run_result.cjs.map +1 -0
- package/dist/voice/testing/run_result.d.cts +385 -0
- package/dist/voice/testing/run_result.d.ts +385 -0
- package/dist/voice/testing/run_result.d.ts.map +1 -0
- package/dist/voice/testing/run_result.js +790 -0
- package/dist/voice/testing/run_result.js.map +1 -0
- package/dist/voice/testing/types.cjs +46 -0
- package/dist/voice/testing/types.cjs.map +1 -0
- package/dist/voice/testing/types.d.cts +83 -0
- package/dist/voice/testing/types.d.ts +83 -0
- package/dist/voice/testing/types.d.ts.map +1 -0
- package/dist/voice/testing/types.js +19 -0
- package/dist/voice/testing/types.js.map +1 -0
- package/dist/voice/transcription/synchronizer.cjs +139 -15
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +143 -16
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +157 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +37 -0
- package/dist/voice/turn_config/utils.d.ts +37 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +131 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +128 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +127 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/dist/worker.cjs +44 -52
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +18 -8
- package/dist/worker.d.ts +18 -8
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +43 -43
- package/dist/worker.js.map +1 -1
- package/package.json +35 -13
- package/resources/NOTICE +2 -0
- package/resources/keyboard-typing.ogg +0 -0
- package/resources/keyboard-typing2.ogg +0 -0
- package/resources/office-ambience.ogg +0 -0
- package/src/_exceptions.ts +5 -0
- package/src/audio.ts +132 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +203 -0
- package/src/cli.ts +57 -66
- package/src/connection_pool.test.ts +346 -0
- package/src/connection_pool.ts +307 -0
- package/src/constants.ts +14 -0
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/http_server.ts +18 -6
- package/src/index.ts +15 -13
- package/src/inference/api_protos.ts +85 -2
- package/src/inference/index.ts +32 -4
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +207 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +204 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +416 -0
- package/src/inference/llm.ts +214 -163
- package/src/inference/stt.test.ts +253 -0
- package/src/inference/stt.ts +449 -208
- package/src/inference/tts.test.ts +354 -0
- package/src/inference/tts.ts +417 -115
- package/src/inference/utils.ts +30 -2
- package/src/ipc/inference_proc_executor.ts +11 -3
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_executor.ts +11 -1
- package/src/ipc/job_proc_lazy_main.ts +86 -20
- package/src/ipc/supervised_proc.test.ts +153 -0
- package/src/ipc/supervised_proc.ts +39 -10
- package/src/job.ts +120 -1
- package/src/language.test.ts +62 -0
- package/src/language.ts +380 -0
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
- package/src/llm/chat_context.test.ts +787 -0
- package/src/llm/chat_context.ts +493 -2
- package/src/llm/fallback_adapter.test.ts +238 -0
- package/src/llm/fallback_adapter.ts +394 -0
- package/src/llm/index.ts +13 -0
- package/src/llm/llm.ts +77 -12
- package/src/llm/provider_format/google.test.ts +72 -1
- package/src/llm/provider_format/google.ts +10 -6
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +480 -2
- package/src/llm/provider_format/openai.ts +152 -21
- package/src/llm/provider_format/utils.ts +11 -5
- package/src/llm/realtime.ts +23 -2
- package/src/llm/remote_chat_context.ts +2 -2
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +115 -17
- package/src/llm/utils.test.ts +103 -2
- package/src/llm/utils.ts +152 -16
- package/src/llm/zod-utils.test.ts +577 -0
- package/src/llm/zod-utils.ts +153 -0
- package/src/log.ts +71 -19
- package/src/metrics/base.ts +78 -19
- package/src/metrics/index.ts +12 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +14 -3
- package/src/metrics/utils.ts +27 -7
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +43 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +545 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stream/stream_channel.test.ts +37 -0
- package/src/stream/stream_channel.ts +43 -3
- package/src/stt/stream_adapter.ts +30 -9
- package/src/stt/stt.ts +140 -23
- package/src/telemetry/index.ts +28 -0
- package/src/telemetry/logging.ts +55 -0
- package/src/telemetry/otel_http_exporter.ts +218 -0
- package/src/telemetry/pino_otel_transport.ts +265 -0
- package/src/telemetry/trace_types.ts +109 -0
- package/src/telemetry/traces.ts +673 -0
- package/src/telemetry/utils.ts +61 -0
- package/src/tokenize/basic/sentence.ts +3 -3
- package/src/tokenize/tokenizer.test.ts +4 -0
- package/src/transcription.ts +6 -0
- package/src/tts/fallback_adapter.ts +586 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/stream_adapter.ts +38 -8
- package/src/tts/tts.ts +245 -62
- package/src/typed_promise.ts +67 -0
- package/src/types.ts +62 -33
- package/src/utils.test.ts +90 -10
- package/src/utils.ts +178 -33
- package/src/vad.ts +42 -18
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +347 -2
- package/src/voice/agent.ts +346 -44
- package/src/voice/agent_activity.test.ts +194 -0
- package/src/voice/agent_activity.ts +1457 -388
- package/src/voice/agent_session.ts +817 -112
- package/src/voice/audio_recognition.ts +845 -70
- package/src/voice/audio_recognition_span.test.ts +341 -0
- package/src/voice/avatar/datastream_io.ts +9 -1
- package/src/voice/background_audio.ts +494 -0
- package/src/voice/events.ts +27 -7
- package/src/voice/generation.ts +310 -56
- package/src/voice/generation_tools.test.ts +268 -0
- package/src/voice/index.ts +17 -3
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/io.ts +115 -12
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +783 -0
- package/src/voice/remote_session.ts +1083 -0
- package/src/voice/report.test.ts +136 -0
- package/src/voice/report.ts +140 -0
- package/src/voice/room_io/_input.ts +45 -10
- package/src/voice/room_io/_output.ts +26 -14
- package/src/voice/room_io/room_io.ts +67 -22
- package/src/voice/speech_handle.ts +38 -6
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +52 -0
- package/src/voice/testing/run_result.ts +995 -0
- package/src/voice/testing/types.ts +118 -0
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +204 -19
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +148 -0
- package/src/voice/turn_config/utils.ts +167 -0
- package/src/voice/utils.ts +29 -0
- package/src/worker.ts +92 -78
- package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
|
@@ -1,20 +1,31 @@
|
|
|
1
1
|
import { Mutex } from "@livekit/mutex";
|
|
2
|
+
import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
|
|
2
3
|
import { Heap } from "heap-js";
|
|
3
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
4
|
-
import { ReadableStream } from "node:stream/web";
|
|
5
|
+
import { ReadableStream, TransformStream } from "node:stream/web";
|
|
6
|
+
import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
|
|
5
7
|
import { ChatMessage } from "../llm/chat_context.js";
|
|
6
8
|
import {
|
|
7
9
|
LLM,
|
|
8
|
-
RealtimeModel
|
|
10
|
+
RealtimeModel,
|
|
11
|
+
ToolFlag
|
|
9
12
|
} from "../llm/index.js";
|
|
13
|
+
import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
|
|
10
14
|
import { log } from "../log.js";
|
|
11
|
-
import {
|
|
15
|
+
import { MultiInputStream } from "../stream/multi_input_stream.js";
|
|
12
16
|
import { STT } from "../stt/stt.js";
|
|
17
|
+
import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
|
|
13
18
|
import { splitWords } from "../tokenize/basic/word.js";
|
|
14
19
|
import { TTS } from "../tts/tts.js";
|
|
15
|
-
import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
|
|
20
|
+
import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from "../utils.js";
|
|
16
21
|
import { VAD } from "../vad.js";
|
|
17
|
-
import {
|
|
22
|
+
import {
|
|
23
|
+
StopResponse,
|
|
24
|
+
_getActivityTaskInfo,
|
|
25
|
+
_setActivityTaskInfo,
|
|
26
|
+
functionCallStorage,
|
|
27
|
+
speechHandleStorage
|
|
28
|
+
} from "./agent.js";
|
|
18
29
|
import {} from "./agent_session.js";
|
|
19
30
|
import {
|
|
20
31
|
AudioRecognition
|
|
@@ -37,28 +48,66 @@ import {
|
|
|
37
48
|
updateInstructions
|
|
38
49
|
} from "./generation.js";
|
|
39
50
|
import { SpeechHandle } from "./speech_handle.js";
|
|
40
|
-
|
|
51
|
+
import { setParticipantSpanAttributes } from "./utils.js";
|
|
52
|
+
const agentActivityStorage = new AsyncLocalStorage();
|
|
53
|
+
const onEnterStorage = new AsyncLocalStorage();
|
|
41
54
|
class AgentActivity {
|
|
55
|
+
agent;
|
|
56
|
+
agentSession;
|
|
42
57
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
43
58
|
started = false;
|
|
44
59
|
audioRecognition;
|
|
45
60
|
realtimeSession;
|
|
61
|
+
realtimeSpans;
|
|
62
|
+
// Maps response_id to OTEL span for metrics recording
|
|
46
63
|
turnDetectionMode;
|
|
47
64
|
logger = log();
|
|
48
|
-
|
|
65
|
+
_schedulingPaused = true;
|
|
66
|
+
_drainBlockedTasks = [];
|
|
49
67
|
_currentSpeech;
|
|
50
68
|
speechQueue;
|
|
51
69
|
// [priority, timestamp, speechHandle]
|
|
52
70
|
q_updated;
|
|
53
71
|
speechTasks = /* @__PURE__ */ new Set();
|
|
54
72
|
lock = new Mutex();
|
|
55
|
-
audioStream = new
|
|
73
|
+
audioStream = new MultiInputStream();
|
|
74
|
+
audioStreamId;
|
|
56
75
|
// default to null as None, which maps to the default provider tool choice value
|
|
57
76
|
toolChoice = null;
|
|
58
|
-
|
|
59
|
-
|
|
77
|
+
_preemptiveGeneration;
|
|
78
|
+
interruptionDetector;
|
|
79
|
+
isInterruptionDetectionEnabled;
|
|
80
|
+
isInterruptionByAudioActivityEnabled;
|
|
81
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
82
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
83
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
84
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
85
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
86
|
+
onModelError = (ev) => this.onError(ev);
|
|
87
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
88
|
+
this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
|
|
89
|
+
};
|
|
90
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
91
|
+
this.agentSession._usageCollector.collect(ev);
|
|
92
|
+
this.agentSession.emit(
|
|
93
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
94
|
+
createMetricsCollectedEvent({ metrics: ev })
|
|
95
|
+
);
|
|
96
|
+
};
|
|
97
|
+
onInterruptionError = (ev) => {
|
|
98
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
99
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
100
|
+
if (!ev.recoverable) {
|
|
101
|
+
this.agentSession._onError(ev);
|
|
102
|
+
this.fallbackToVadInterruption();
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
105
|
+
this.agentSession._onError(ev);
|
|
106
|
+
};
|
|
60
107
|
/** @internal */
|
|
61
108
|
_mainTask;
|
|
109
|
+
_onEnterTask;
|
|
110
|
+
_onExitTask;
|
|
62
111
|
_userTurnCompletedTask;
|
|
63
112
|
constructor(agent, agentSession) {
|
|
64
113
|
this.agent = agent;
|
|
@@ -70,7 +119,7 @@ class AgentActivity {
|
|
|
70
119
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
71
120
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
72
121
|
this.logger.warn(
|
|
73
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
122
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
74
123
|
);
|
|
75
124
|
this.turnDetectionMode = void 0;
|
|
76
125
|
}
|
|
@@ -113,89 +162,136 @@ class AgentActivity {
|
|
|
113
162
|
);
|
|
114
163
|
this.turnDetectionMode = void 0;
|
|
115
164
|
}
|
|
116
|
-
if (!this.vad && this.stt && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
165
|
+
if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
117
166
|
this.logger.warn(
|
|
118
|
-
"VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
|
|
167
|
+
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
119
168
|
);
|
|
120
169
|
}
|
|
170
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
171
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
172
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
173
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
121
174
|
}
|
|
122
175
|
async start() {
|
|
123
176
|
const unlock = await this.lock.lock();
|
|
124
177
|
try {
|
|
125
|
-
this.
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
178
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
179
|
+
} finally {
|
|
180
|
+
unlock();
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
async resume() {
|
|
184
|
+
const unlock = await this.lock.lock();
|
|
185
|
+
try {
|
|
186
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
187
|
+
} finally {
|
|
188
|
+
unlock();
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
async _startSession(options) {
|
|
192
|
+
var _a, _b, _c, _d, _e;
|
|
193
|
+
const { spanName, runOnEnter } = options;
|
|
194
|
+
const startSpan = tracer.startSpan({
|
|
195
|
+
name: spanName,
|
|
196
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
197
|
+
context: ROOT_CONTEXT
|
|
198
|
+
});
|
|
199
|
+
this.agent._agentActivity = this;
|
|
200
|
+
if (this.llm instanceof RealtimeModel) {
|
|
201
|
+
this.realtimeSession = this.llm.session();
|
|
202
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
203
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
204
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
205
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
206
|
+
this.realtimeSession.on(
|
|
207
|
+
"input_audio_transcription_completed",
|
|
208
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
209
|
+
);
|
|
210
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
211
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
212
|
+
removeInstructions(this.agent._chatCtx);
|
|
213
|
+
try {
|
|
214
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
215
|
+
} catch (error) {
|
|
216
|
+
this.logger.error(error, "failed to update the instructions");
|
|
217
|
+
}
|
|
218
|
+
try {
|
|
219
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
220
|
+
} catch (error) {
|
|
221
|
+
this.logger.error(error, "failed to update the chat context");
|
|
222
|
+
}
|
|
223
|
+
try {
|
|
224
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
225
|
+
} catch (error) {
|
|
226
|
+
this.logger.error(error, "failed to update the tools");
|
|
227
|
+
}
|
|
228
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
229
|
+
this.logger.error(
|
|
230
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
134
231
|
);
|
|
135
|
-
this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
|
|
136
|
-
this.realtimeSession.on("error", (ev) => this.onError(ev));
|
|
137
|
-
removeInstructions(this.agent._chatCtx);
|
|
138
|
-
try {
|
|
139
|
-
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
140
|
-
} catch (error) {
|
|
141
|
-
this.logger.error(error, "failed to update the instructions");
|
|
142
|
-
}
|
|
143
|
-
try {
|
|
144
|
-
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
145
|
-
} catch (error) {
|
|
146
|
-
this.logger.error(error, "failed to update the chat context");
|
|
147
|
-
}
|
|
148
|
-
try {
|
|
149
|
-
await this.realtimeSession.updateTools(this.tools);
|
|
150
|
-
} catch (error) {
|
|
151
|
-
this.logger.error(error, "failed to update the tools");
|
|
152
|
-
}
|
|
153
|
-
} else if (this.llm instanceof LLM) {
|
|
154
|
-
try {
|
|
155
|
-
updateInstructions({
|
|
156
|
-
chatCtx: this.agent._chatCtx,
|
|
157
|
-
instructions: this.agent.instructions,
|
|
158
|
-
addIfMissing: true
|
|
159
|
-
});
|
|
160
|
-
} catch (error) {
|
|
161
|
-
this.logger.error("failed to update the instructions", error);
|
|
162
|
-
}
|
|
163
232
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
233
|
+
} else if (this.llm instanceof LLM) {
|
|
234
|
+
try {
|
|
235
|
+
updateInstructions({
|
|
236
|
+
chatCtx: this.agent._chatCtx,
|
|
237
|
+
instructions: this.agent.instructions,
|
|
238
|
+
addIfMissing: true
|
|
239
|
+
});
|
|
240
|
+
} catch (error) {
|
|
241
|
+
this.logger.error("failed to update the instructions", error);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
if (this.llm instanceof LLM) {
|
|
245
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
246
|
+
this.llm.on("error", this.onModelError);
|
|
247
|
+
}
|
|
248
|
+
if (this.stt instanceof STT) {
|
|
249
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
250
|
+
this.stt.on("error", this.onModelError);
|
|
251
|
+
}
|
|
252
|
+
if (this.tts instanceof TTS) {
|
|
253
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
254
|
+
this.tts.on("error", this.onModelError);
|
|
255
|
+
}
|
|
256
|
+
if (this.vad instanceof VAD) {
|
|
257
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
258
|
+
}
|
|
259
|
+
this.audioRecognition = new AudioRecognition({
|
|
260
|
+
recognitionHooks: this,
|
|
261
|
+
// Disable stt node if stt is not provided
|
|
262
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
263
|
+
vad: this.vad,
|
|
264
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
265
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
266
|
+
interruptionDetection: this.interruptionDetector,
|
|
267
|
+
minEndpointingDelay: ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.endpointing) == null ? void 0 : _b.minDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
|
|
268
|
+
maxEndpointingDelay: ((_d = (_c = this.agent.turnHandling) == null ? void 0 : _c.endpointing) == null ? void 0 : _d.maxDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
|
|
269
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
270
|
+
sttModel: (_e = this.stt) == null ? void 0 : _e.label,
|
|
271
|
+
sttProvider: this.getSttProvider(),
|
|
272
|
+
getLinkedParticipant: () => {
|
|
273
|
+
var _a2;
|
|
274
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
275
|
+
}
|
|
276
|
+
});
|
|
277
|
+
this.audioRecognition.start();
|
|
278
|
+
this.started = true;
|
|
279
|
+
this._resumeSchedulingTask();
|
|
280
|
+
if (runOnEnter) {
|
|
281
|
+
this._onEnterTask = this.createSpeechTask({
|
|
282
|
+
taskFn: () => onEnterStorage.run(
|
|
283
|
+
{ session: this.agentSession, agent: this.agent },
|
|
284
|
+
() => tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
285
|
+
name: "on_enter",
|
|
286
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
287
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
288
|
+
})
|
|
289
|
+
),
|
|
290
|
+
inlineTask: true,
|
|
194
291
|
name: "AgentActivity_onEnter"
|
|
195
292
|
});
|
|
196
|
-
} finally {
|
|
197
|
-
unlock();
|
|
198
293
|
}
|
|
294
|
+
startSpan.end();
|
|
199
295
|
}
|
|
200
296
|
get currentSpeech() {
|
|
201
297
|
return this._currentSpeech;
|
|
@@ -206,6 +302,15 @@ class AgentActivity {
|
|
|
206
302
|
get stt() {
|
|
207
303
|
return this.agent.stt || this.agentSession.stt;
|
|
208
304
|
}
|
|
305
|
+
getSttProvider() {
|
|
306
|
+
var _a;
|
|
307
|
+
const label = (_a = this.stt) == null ? void 0 : _a.label;
|
|
308
|
+
if (!label) {
|
|
309
|
+
return void 0;
|
|
310
|
+
}
|
|
311
|
+
const [provider] = label.split("-", 1);
|
|
312
|
+
return provider || label;
|
|
313
|
+
}
|
|
209
314
|
get llm() {
|
|
210
315
|
return this.agent.llm || this.agentSession.llm;
|
|
211
316
|
}
|
|
@@ -215,21 +320,46 @@ class AgentActivity {
|
|
|
215
320
|
get tools() {
|
|
216
321
|
return this.agent.toolCtx;
|
|
217
322
|
}
|
|
218
|
-
get
|
|
219
|
-
return this.
|
|
323
|
+
get schedulingPaused() {
|
|
324
|
+
return this._schedulingPaused;
|
|
220
325
|
}
|
|
221
326
|
get realtimeLLMSession() {
|
|
222
327
|
return this.realtimeSession;
|
|
223
328
|
}
|
|
224
329
|
get allowInterruptions() {
|
|
225
|
-
|
|
330
|
+
var _a, _b;
|
|
331
|
+
return ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.enabled) ?? this.agentSession.sessionOptions.turnHandling.interruption.enabled;
|
|
332
|
+
}
|
|
333
|
+
get useTtsAlignedTranscript() {
|
|
334
|
+
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
226
335
|
}
|
|
227
336
|
get turnDetection() {
|
|
228
|
-
|
|
337
|
+
var _a;
|
|
338
|
+
return ((_a = this.agent.turnHandling) == null ? void 0 : _a.turnDetection) ?? this.agentSession.turnDetection;
|
|
339
|
+
}
|
|
340
|
+
get turnHandling() {
|
|
341
|
+
return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
|
|
229
342
|
}
|
|
343
|
+
// get minEndpointingDelay(): number {
|
|
344
|
+
// return (
|
|
345
|
+
// this.agent.turnHandling?.endpointing?.minDelay ??
|
|
346
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
|
|
347
|
+
// );
|
|
348
|
+
// }
|
|
349
|
+
// get maxEndpointingDelay(): number {
|
|
350
|
+
// return (
|
|
351
|
+
// this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
352
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
|
|
353
|
+
// );
|
|
354
|
+
// }
|
|
230
355
|
get toolCtx() {
|
|
231
356
|
return this.agent.toolCtx;
|
|
232
357
|
}
|
|
358
|
+
/** @internal */
|
|
359
|
+
get inputStartedAt() {
|
|
360
|
+
var _a;
|
|
361
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
362
|
+
}
|
|
233
363
|
async updateChatCtx(chatCtx) {
|
|
234
364
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
235
365
|
this.agent._chatCtx = chatCtx;
|
|
@@ -244,36 +374,79 @@ class AgentActivity {
|
|
|
244
374
|
});
|
|
245
375
|
}
|
|
246
376
|
}
|
|
247
|
-
|
|
377
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
378
|
+
async updateTools(tools) {
|
|
379
|
+
this.agent._tools = { ...tools };
|
|
380
|
+
if (this.realtimeSession) {
|
|
381
|
+
await this.realtimeSession.updateTools(tools);
|
|
382
|
+
}
|
|
383
|
+
if (this.llm instanceof LLM) {
|
|
384
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
updateOptions({
|
|
388
|
+
toolChoice,
|
|
389
|
+
turnDetection
|
|
390
|
+
}) {
|
|
248
391
|
if (toolChoice !== void 0) {
|
|
249
392
|
this.toolChoice = toolChoice;
|
|
250
393
|
}
|
|
251
394
|
if (this.realtimeSession) {
|
|
252
395
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
253
396
|
}
|
|
397
|
+
if (turnDetection !== void 0) {
|
|
398
|
+
this.turnDetectionMode = turnDetection;
|
|
399
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
400
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
401
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
if (this.audioRecognition) {
|
|
405
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
406
|
+
}
|
|
254
407
|
}
|
|
255
408
|
attachAudioInput(audioStream) {
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
409
|
+
void this.audioStream.close();
|
|
410
|
+
this.audioStream = new MultiInputStream();
|
|
411
|
+
const aecWarmupAudioFilter = new TransformStream({
|
|
412
|
+
transform: (frame, controller) => {
|
|
413
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
414
|
+
if (!shouldDiscardForAecWarmup) {
|
|
415
|
+
controller.enqueue(frame);
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
});
|
|
419
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
420
|
+
if (this.realtimeSession && this.audioRecognition) {
|
|
421
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
263
422
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
264
|
-
}
|
|
265
|
-
if (this.audioRecognition) {
|
|
266
423
|
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
|
|
424
|
+
} else if (this.realtimeSession) {
|
|
425
|
+
this.realtimeSession.setInputAudioStream(
|
|
426
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
|
|
427
|
+
);
|
|
428
|
+
} else if (this.audioRecognition) {
|
|
429
|
+
this.audioRecognition.setInputAudioStream(
|
|
430
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
|
|
431
|
+
);
|
|
267
432
|
}
|
|
268
433
|
}
|
|
269
434
|
detachAudioInput() {
|
|
270
|
-
this.
|
|
435
|
+
if (this.audioStreamId === void 0) {
|
|
436
|
+
return;
|
|
437
|
+
}
|
|
438
|
+
void this.audioStream.close();
|
|
439
|
+
this.audioStream = new MultiInputStream();
|
|
440
|
+
this.audioStreamId = void 0;
|
|
271
441
|
}
|
|
272
|
-
commitUserTurn() {
|
|
442
|
+
commitUserTurn(options = {}) {
|
|
443
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
273
444
|
if (!this.audioRecognition) {
|
|
274
|
-
|
|
445
|
+
if (throwIfNotReady) {
|
|
446
|
+
throw new Error("AudioRecognition is not initialized");
|
|
447
|
+
}
|
|
448
|
+
return;
|
|
275
449
|
}
|
|
276
|
-
const audioDetached = false;
|
|
277
450
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
278
451
|
}
|
|
279
452
|
clearUserTurn() {
|
|
@@ -309,13 +482,11 @@ class AgentActivity {
|
|
|
309
482
|
})
|
|
310
483
|
);
|
|
311
484
|
const task = this.createSpeechTask({
|
|
312
|
-
|
|
313
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
314
|
-
),
|
|
485
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
315
486
|
ownedSpeechHandle: handle,
|
|
316
487
|
name: "AgentActivity.say_tts"
|
|
317
488
|
});
|
|
318
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
489
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
319
490
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
320
491
|
return handle;
|
|
321
492
|
}
|
|
@@ -325,6 +496,14 @@ class AgentActivity {
|
|
|
325
496
|
if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
|
|
326
497
|
ev.speechId = speechHandle.id;
|
|
327
498
|
}
|
|
499
|
+
if (ev.type === "realtime_model_metrics" && this.realtimeSpans) {
|
|
500
|
+
const span = this.realtimeSpans.get(ev.requestId);
|
|
501
|
+
if (span) {
|
|
502
|
+
recordRealtimeMetrics(span, ev);
|
|
503
|
+
this.realtimeSpans.delete(ev.requestId);
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
this.agentSession._usageCollector.collect(ev);
|
|
328
507
|
this.agentSession.emit(
|
|
329
508
|
AgentSessionEventTypes.MetricsCollected,
|
|
330
509
|
createMetricsCollectedEvent({ metrics: ev })
|
|
@@ -351,6 +530,13 @@ class AgentActivity {
|
|
|
351
530
|
this.logger.info("onInputSpeechStarted");
|
|
352
531
|
if (!this.vad) {
|
|
353
532
|
this.agentSession._updateUserState("speaking");
|
|
533
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
534
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
535
|
+
0,
|
|
536
|
+
Date.now(),
|
|
537
|
+
this.agentSession._userSpeakingSpan
|
|
538
|
+
);
|
|
539
|
+
}
|
|
354
540
|
}
|
|
355
541
|
try {
|
|
356
542
|
this.interrupt();
|
|
@@ -364,6 +550,9 @@ class AgentActivity {
|
|
|
364
550
|
onInputSpeechStopped(ev) {
|
|
365
551
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
366
552
|
if (!this.vad) {
|
|
553
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
554
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
555
|
+
}
|
|
367
556
|
this.agentSession._updateUserState("listening");
|
|
368
557
|
}
|
|
369
558
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -398,8 +587,8 @@ class AgentActivity {
|
|
|
398
587
|
if (ev.userInitiated) {
|
|
399
588
|
return;
|
|
400
589
|
}
|
|
401
|
-
if (this.
|
|
402
|
-
this.logger.warn("skipping new realtime generation, the
|
|
590
|
+
if (this.schedulingPaused) {
|
|
591
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
403
592
|
return;
|
|
404
593
|
}
|
|
405
594
|
const handle = SpeechHandle.create({
|
|
@@ -415,45 +604,91 @@ class AgentActivity {
|
|
|
415
604
|
);
|
|
416
605
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
417
606
|
this.createSpeechTask({
|
|
418
|
-
|
|
419
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
420
|
-
),
|
|
607
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
421
608
|
ownedSpeechHandle: handle,
|
|
422
609
|
name: "AgentActivity.realtimeGeneration"
|
|
423
610
|
});
|
|
424
611
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
425
612
|
}
|
|
426
613
|
// recognition hooks
|
|
427
|
-
onStartOfSpeech(
|
|
428
|
-
|
|
614
|
+
onStartOfSpeech(ev) {
|
|
615
|
+
let speechStartTime = Date.now();
|
|
616
|
+
if (ev) {
|
|
617
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
618
|
+
}
|
|
619
|
+
this.agentSession._updateUserState("speaking", {
|
|
620
|
+
lastSpeakingTime: speechStartTime,
|
|
621
|
+
otelContext: otelContext.active()
|
|
622
|
+
});
|
|
623
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
624
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
625
|
+
ev.speechDuration,
|
|
626
|
+
speechStartTime,
|
|
627
|
+
this.agentSession._userSpeakingSpan
|
|
628
|
+
);
|
|
629
|
+
}
|
|
429
630
|
}
|
|
430
|
-
onEndOfSpeech(
|
|
431
|
-
|
|
631
|
+
onEndOfSpeech(ev) {
|
|
632
|
+
let speechEndTime = Date.now();
|
|
633
|
+
if (ev) {
|
|
634
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
635
|
+
}
|
|
636
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
637
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
638
|
+
speechEndTime,
|
|
639
|
+
this.agentSession._userSpeakingSpan
|
|
640
|
+
);
|
|
641
|
+
}
|
|
642
|
+
this.agentSession._updateUserState("listening", {
|
|
643
|
+
lastSpeakingTime: speechEndTime,
|
|
644
|
+
otelContext: otelContext.active()
|
|
645
|
+
});
|
|
432
646
|
}
|
|
433
647
|
onVADInferenceDone(ev) {
|
|
434
|
-
var _a
|
|
648
|
+
var _a;
|
|
435
649
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
436
650
|
return;
|
|
437
651
|
}
|
|
438
|
-
if (
|
|
652
|
+
if (ev.speechDuration >= ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
653
|
+
this.interruptByAudioActivity();
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
interruptByAudioActivity() {
|
|
657
|
+
var _a, _b, _c, _d;
|
|
658
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
659
|
+
return;
|
|
660
|
+
}
|
|
661
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
439
662
|
return;
|
|
440
663
|
}
|
|
441
|
-
if (
|
|
664
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
442
665
|
return;
|
|
443
666
|
}
|
|
444
|
-
if (this.stt && this.agentSession.
|
|
667
|
+
if (this.stt && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
445
668
|
const text = this.audioRecognition.currentTranscript;
|
|
446
|
-
|
|
669
|
+
const normalizedText = text ?? "";
|
|
670
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
671
|
+
if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
447
672
|
return;
|
|
448
673
|
}
|
|
449
674
|
}
|
|
450
|
-
(
|
|
675
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
451
676
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
452
|
-
this.logger.info(
|
|
453
|
-
|
|
677
|
+
this.logger.info(
|
|
678
|
+
{ "speech id": this._currentSpeech.id },
|
|
679
|
+
"speech interrupted by audio activity"
|
|
680
|
+
);
|
|
681
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
454
682
|
this._currentSpeech.interrupt();
|
|
455
683
|
}
|
|
456
684
|
}
|
|
685
|
+
onInterruption(ev) {
|
|
686
|
+
this.restoreInterruptionByAudioActivity();
|
|
687
|
+
this.interruptByAudioActivity();
|
|
688
|
+
if (this.audioRecognition) {
|
|
689
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
|
|
690
|
+
}
|
|
691
|
+
}
|
|
457
692
|
onInterimTranscript(ev) {
|
|
458
693
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
459
694
|
return;
|
|
@@ -462,10 +697,14 @@ class AgentActivity {
|
|
|
462
697
|
AgentSessionEventTypes.UserInputTranscribed,
|
|
463
698
|
createUserInputTranscribedEvent({
|
|
464
699
|
transcript: ev.alternatives[0].text,
|
|
465
|
-
isFinal: false
|
|
700
|
+
isFinal: false,
|
|
701
|
+
language: ev.alternatives[0].language
|
|
466
702
|
// TODO(AJS-106): add multi participant support
|
|
467
703
|
})
|
|
468
704
|
);
|
|
705
|
+
if (ev.alternatives[0].text) {
|
|
706
|
+
this.interruptByAudioActivity();
|
|
707
|
+
}
|
|
469
708
|
}
|
|
470
709
|
onFinalTranscript(ev) {
|
|
471
710
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
@@ -475,13 +714,70 @@ class AgentActivity {
|
|
|
475
714
|
AgentSessionEventTypes.UserInputTranscribed,
|
|
476
715
|
createUserInputTranscribedEvent({
|
|
477
716
|
transcript: ev.alternatives[0].text,
|
|
478
|
-
isFinal: true
|
|
717
|
+
isFinal: true,
|
|
718
|
+
language: ev.alternatives[0].language
|
|
479
719
|
// TODO(AJS-106): add multi participant support
|
|
480
720
|
})
|
|
481
721
|
);
|
|
722
|
+
if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
|
|
723
|
+
this.interruptByAudioActivity();
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
onPreemptiveGeneration(info) {
|
|
727
|
+
if (!this.agentSession.sessionOptions.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
|
|
728
|
+
return;
|
|
729
|
+
}
|
|
730
|
+
this.cancelPreemptiveGeneration();
|
|
731
|
+
this.logger.info(
|
|
732
|
+
{
|
|
733
|
+
newTranscript: info.newTranscript,
|
|
734
|
+
transcriptConfidence: info.transcriptConfidence
|
|
735
|
+
},
|
|
736
|
+
"starting preemptive generation"
|
|
737
|
+
);
|
|
738
|
+
const userMessage = ChatMessage.create({
|
|
739
|
+
role: "user",
|
|
740
|
+
content: info.newTranscript,
|
|
741
|
+
transcriptConfidence: info.transcriptConfidence
|
|
742
|
+
});
|
|
743
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
744
|
+
const speechHandle = this.generateReply({
|
|
745
|
+
userMessage,
|
|
746
|
+
chatCtx,
|
|
747
|
+
scheduleSpeech: false
|
|
748
|
+
});
|
|
749
|
+
this._preemptiveGeneration = {
|
|
750
|
+
speechHandle,
|
|
751
|
+
userMessage,
|
|
752
|
+
info,
|
|
753
|
+
chatCtx: chatCtx.copy(),
|
|
754
|
+
tools: { ...this.tools },
|
|
755
|
+
toolChoice: this.toolChoice,
|
|
756
|
+
createdAt: Date.now()
|
|
757
|
+
};
|
|
758
|
+
}
|
|
759
|
+
cancelPreemptiveGeneration() {
|
|
760
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
761
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
762
|
+
this._preemptiveGeneration = void 0;
|
|
763
|
+
}
|
|
482
764
|
}
|
|
483
765
|
createSpeechTask(options) {
|
|
484
|
-
const {
|
|
766
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
767
|
+
const wrappedFn = (ctrl) => {
|
|
768
|
+
return agentActivityStorage.run(this, () => {
|
|
769
|
+
const currentTask = Task.current();
|
|
770
|
+
if (currentTask) {
|
|
771
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
772
|
+
}
|
|
773
|
+
if (ownedSpeechHandle) {
|
|
774
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
775
|
+
}
|
|
776
|
+
return taskFn(ctrl);
|
|
777
|
+
});
|
|
778
|
+
};
|
|
779
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
780
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
485
781
|
this.speechTasks.add(task);
|
|
486
782
|
task.addDoneCallback(() => {
|
|
487
783
|
this.speechTasks.delete(task);
|
|
@@ -497,20 +793,35 @@ class AgentActivity {
|
|
|
497
793
|
task.addDoneCallback(() => {
|
|
498
794
|
this.wakeupMainTask();
|
|
499
795
|
});
|
|
500
|
-
return task
|
|
796
|
+
return task;
|
|
501
797
|
}
|
|
502
798
|
async onEndOfTurn(info) {
|
|
503
|
-
|
|
504
|
-
|
|
799
|
+
var _a, _b;
|
|
800
|
+
if (this.schedulingPaused) {
|
|
801
|
+
this.cancelPreemptiveGeneration();
|
|
802
|
+
this.logger.warn(
|
|
803
|
+
{ user_input: info.newTranscript },
|
|
804
|
+
"skipping user input, speech scheduling is paused"
|
|
805
|
+
);
|
|
505
806
|
return true;
|
|
506
807
|
}
|
|
507
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.
|
|
508
|
-
|
|
509
|
-
|
|
808
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
809
|
+
const wordCount = splitWords(info.newTranscript, true).length;
|
|
810
|
+
if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
811
|
+
this.cancelPreemptiveGeneration();
|
|
812
|
+
this.logger.info(
|
|
813
|
+
{
|
|
814
|
+
wordCount,
|
|
815
|
+
minInterruptionWords: this.agentSession.sessionOptions.turnHandling.interruption.minWords
|
|
816
|
+
},
|
|
817
|
+
"skipping user input, word count below minimum interruption threshold"
|
|
818
|
+
);
|
|
819
|
+
return false;
|
|
820
|
+
}
|
|
510
821
|
}
|
|
511
822
|
const oldTask = this._userTurnCompletedTask;
|
|
512
823
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
513
|
-
|
|
824
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
514
825
|
name: "AgentActivity.userTurnCompleted"
|
|
515
826
|
});
|
|
516
827
|
return true;
|
|
@@ -535,19 +846,49 @@ class AgentActivity {
|
|
|
535
846
|
throw new Error("Speech queue is empty");
|
|
536
847
|
}
|
|
537
848
|
const speechHandle = heapItem[2];
|
|
849
|
+
if (speechHandle.interrupted || speechHandle.done()) {
|
|
850
|
+
continue;
|
|
851
|
+
}
|
|
538
852
|
this._currentSpeech = speechHandle;
|
|
539
853
|
speechHandle._authorizeGeneration();
|
|
540
|
-
await speechHandle._waitForGeneration();
|
|
854
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
|
|
541
855
|
this._currentSpeech = void 0;
|
|
542
856
|
}
|
|
543
|
-
|
|
544
|
-
|
|
857
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
858
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
859
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
545
860
|
break;
|
|
546
861
|
}
|
|
547
862
|
this.q_updated = new Future();
|
|
548
863
|
}
|
|
549
864
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
550
865
|
}
|
|
866
|
+
getDrainPendingSpeechTasks() {
|
|
867
|
+
const blockedHandles = [];
|
|
868
|
+
for (const task of this._drainBlockedTasks) {
|
|
869
|
+
const info = _getActivityTaskInfo(task);
|
|
870
|
+
if (!info) {
|
|
871
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
872
|
+
continue;
|
|
873
|
+
}
|
|
874
|
+
if (!info.speechHandle) {
|
|
875
|
+
continue;
|
|
876
|
+
}
|
|
877
|
+
blockedHandles.push(info.speechHandle);
|
|
878
|
+
}
|
|
879
|
+
const toWait = [];
|
|
880
|
+
for (const task of this.speechTasks) {
|
|
881
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
882
|
+
continue;
|
|
883
|
+
}
|
|
884
|
+
const info = _getActivityTaskInfo(task);
|
|
885
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
886
|
+
continue;
|
|
887
|
+
}
|
|
888
|
+
toWait.push(task);
|
|
889
|
+
}
|
|
890
|
+
return toWait;
|
|
891
|
+
}
|
|
551
892
|
wakeupMainTask() {
|
|
552
893
|
this.q_updated.resolve();
|
|
553
894
|
}
|
|
@@ -558,7 +899,8 @@ class AgentActivity {
|
|
|
558
899
|
chatCtx,
|
|
559
900
|
instructions: defaultInstructions,
|
|
560
901
|
toolChoice: defaultToolChoice,
|
|
561
|
-
allowInterruptions: defaultAllowInterruptions
|
|
902
|
+
allowInterruptions: defaultAllowInterruptions,
|
|
903
|
+
scheduleSpeech = true
|
|
562
904
|
} = options;
|
|
563
905
|
let instructions = defaultInstructions;
|
|
564
906
|
let toolChoice = defaultToolChoice;
|
|
@@ -572,7 +914,7 @@ class AgentActivity {
|
|
|
572
914
|
if (this.llm === void 0) {
|
|
573
915
|
throw new Error("trying to generate reply without an LLM model");
|
|
574
916
|
}
|
|
575
|
-
const functionCall = (_a =
|
|
917
|
+
const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
576
918
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
577
919
|
toolChoice = "none";
|
|
578
920
|
}
|
|
@@ -590,19 +932,17 @@ class AgentActivity {
|
|
|
590
932
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
591
933
|
if (this.llm instanceof RealtimeModel) {
|
|
592
934
|
this.createSpeechTask({
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
})
|
|
605
|
-
),
|
|
935
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
936
|
+
speechHandle: handle,
|
|
937
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
938
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
939
|
+
instructions,
|
|
940
|
+
modelSettings: {
|
|
941
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
942
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
943
|
+
},
|
|
944
|
+
abortController
|
|
945
|
+
}),
|
|
606
946
|
ownedSpeechHandle: handle,
|
|
607
947
|
name: "AgentActivity.realtimeReply"
|
|
608
948
|
});
|
|
@@ -611,39 +951,56 @@ class AgentActivity {
|
|
|
611
951
|
instructions = `${this.agent.instructions}
|
|
612
952
|
${instructions}`;
|
|
613
953
|
}
|
|
954
|
+
const onEnterData = onEnterStorage.getStore();
|
|
955
|
+
const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
|
|
956
|
+
const tools = shouldFilterTools ? Object.fromEntries(
|
|
957
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
958
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
|
|
959
|
+
)
|
|
960
|
+
) : this.agent.toolCtx;
|
|
614
961
|
const task = this.createSpeechTask({
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
${instructions}` : instructions,
|
|
626
|
-
userMessage
|
|
627
|
-
)
|
|
962
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
963
|
+
handle,
|
|
964
|
+
chatCtx ?? this.agent.chatCtx,
|
|
965
|
+
tools,
|
|
966
|
+
{
|
|
967
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
968
|
+
},
|
|
969
|
+
abortController,
|
|
970
|
+
instructions,
|
|
971
|
+
userMessage
|
|
628
972
|
),
|
|
629
973
|
ownedSpeechHandle: handle,
|
|
630
974
|
name: "AgentActivity.pipelineReply"
|
|
631
975
|
});
|
|
632
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
976
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
977
|
+
}
|
|
978
|
+
if (scheduleSpeech) {
|
|
979
|
+
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
633
980
|
}
|
|
634
|
-
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
635
981
|
return handle;
|
|
636
982
|
}
|
|
637
|
-
interrupt() {
|
|
983
|
+
interrupt(options = {}) {
|
|
638
984
|
var _a;
|
|
985
|
+
const { force = false } = options;
|
|
986
|
+
this.cancelPreemptiveGeneration();
|
|
639
987
|
const future = new Future();
|
|
640
988
|
const currentSpeech = this._currentSpeech;
|
|
641
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
989
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
642
990
|
for (const [_, __, speech] of this.speechQueue) {
|
|
643
|
-
speech.interrupt();
|
|
991
|
+
speech.interrupt(force);
|
|
644
992
|
}
|
|
645
993
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
646
|
-
if (
|
|
994
|
+
if (force) {
|
|
995
|
+
for (const task of this.speechTasks) {
|
|
996
|
+
task.cancel();
|
|
997
|
+
}
|
|
998
|
+
if (currentSpeech && !currentSpeech.done()) {
|
|
999
|
+
currentSpeech._markDone();
|
|
1000
|
+
}
|
|
1001
|
+
this.speechQueue.clear();
|
|
1002
|
+
future.resolve();
|
|
1003
|
+
} else if (currentSpeech === void 0) {
|
|
647
1004
|
future.resolve();
|
|
648
1005
|
} else {
|
|
649
1006
|
currentSpeech.addDoneCallback(() => {
|
|
@@ -661,7 +1018,7 @@ ${instructions}` : instructions,
|
|
|
661
1018
|
async userTurnCompleted(info, oldTask) {
|
|
662
1019
|
var _a, _b;
|
|
663
1020
|
if (oldTask) {
|
|
664
|
-
await oldTask;
|
|
1021
|
+
await oldTask.result;
|
|
665
1022
|
}
|
|
666
1023
|
if (this.llm instanceof RealtimeModel) {
|
|
667
1024
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -686,7 +1043,8 @@ ${instructions}` : instructions,
|
|
|
686
1043
|
}
|
|
687
1044
|
let userMessage = ChatMessage.create({
|
|
688
1045
|
role: "user",
|
|
689
|
-
content: info.newTranscript
|
|
1046
|
+
content: info.newTranscript,
|
|
1047
|
+
transcriptConfidence: info.transcriptConfidence
|
|
690
1048
|
});
|
|
691
1049
|
const chatCtx = this.agent.chatCtx.copy();
|
|
692
1050
|
const startTime = Date.now();
|
|
@@ -704,13 +1062,57 @@ ${instructions}` : instructions,
|
|
|
704
1062
|
} else if (this.llm === void 0) {
|
|
705
1063
|
return;
|
|
706
1064
|
}
|
|
707
|
-
const
|
|
1065
|
+
const userMetricsReport = {};
|
|
1066
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1067
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1068
|
+
}
|
|
1069
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1070
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1071
|
+
}
|
|
1072
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1073
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1074
|
+
}
|
|
1075
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1076
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1077
|
+
}
|
|
1078
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1079
|
+
if (userMessage) {
|
|
1080
|
+
userMessage.metrics = userMetricsReport;
|
|
1081
|
+
}
|
|
1082
|
+
let speechHandle;
|
|
1083
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
1084
|
+
const preemptive = this._preemptiveGeneration;
|
|
1085
|
+
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
|
|
1086
|
+
speechHandle = preemptive.speechHandle;
|
|
1087
|
+
if (preemptive.userMessage && userMessage) {
|
|
1088
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1089
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1090
|
+
}
|
|
1091
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1092
|
+
this.logger.debug(
|
|
1093
|
+
{
|
|
1094
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt
|
|
1095
|
+
},
|
|
1096
|
+
"using preemptive generation"
|
|
1097
|
+
);
|
|
1098
|
+
} else {
|
|
1099
|
+
this.logger.warn(
|
|
1100
|
+
"preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
|
|
1101
|
+
);
|
|
1102
|
+
preemptive.speechHandle._cancel();
|
|
1103
|
+
}
|
|
1104
|
+
this._preemptiveGeneration = void 0;
|
|
1105
|
+
}
|
|
1106
|
+
if (speechHandle === void 0) {
|
|
1107
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
1108
|
+
}
|
|
708
1109
|
const eouMetrics = {
|
|
709
1110
|
type: "eou_metrics",
|
|
710
1111
|
timestamp: Date.now(),
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
1112
|
+
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
1113
|
+
transcriptionDelayMs: info.transcriptionDelay,
|
|
1114
|
+
onUserTurnCompletedDelayMs: callbackDuration,
|
|
1115
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
714
1116
|
speechId: speechHandle.id
|
|
715
1117
|
};
|
|
716
1118
|
this.agentSession.emit(
|
|
@@ -719,6 +1121,8 @@ ${instructions}` : instructions,
|
|
|
719
1121
|
);
|
|
720
1122
|
}
|
|
721
1123
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
1124
|
+
var _a, _b;
|
|
1125
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
722
1126
|
speechHandleStorage.enterWith(speechHandle);
|
|
723
1127
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
724
1128
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
@@ -750,25 +1154,38 @@ ${instructions}` : instructions,
|
|
|
750
1154
|
textOut = _textOut;
|
|
751
1155
|
tasks.push(textForwardTask);
|
|
752
1156
|
}
|
|
753
|
-
|
|
754
|
-
|
|
1157
|
+
let replyStartedSpeakingAt;
|
|
1158
|
+
let replyTtsGenData = null;
|
|
1159
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1160
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1161
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1162
|
+
startTime: startedSpeakingAt,
|
|
1163
|
+
otelContext: speechHandle._agentTurnContext
|
|
1164
|
+
});
|
|
1165
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1166
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1167
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1168
|
+
}
|
|
755
1169
|
};
|
|
756
1170
|
if (!audioOutput) {
|
|
757
1171
|
if (textOut) {
|
|
758
|
-
textOut.firstTextFut.await.
|
|
1172
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
759
1173
|
}
|
|
760
1174
|
} else {
|
|
761
1175
|
let audioOut = null;
|
|
762
1176
|
if (!audio) {
|
|
763
|
-
const [ttsTask,
|
|
1177
|
+
const [ttsTask, ttsGenData] = performTTSInference(
|
|
764
1178
|
(...args) => this.agent.ttsNode(...args),
|
|
765
1179
|
audioSource,
|
|
766
1180
|
modelSettings,
|
|
767
|
-
replyAbortController
|
|
1181
|
+
replyAbortController,
|
|
1182
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1183
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
768
1184
|
);
|
|
769
1185
|
tasks.push(ttsTask);
|
|
1186
|
+
replyTtsGenData = ttsGenData;
|
|
770
1187
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
771
|
-
|
|
1188
|
+
ttsGenData.audioStream,
|
|
772
1189
|
audioOutput,
|
|
773
1190
|
replyAbortController
|
|
774
1191
|
);
|
|
@@ -783,7 +1200,7 @@ ${instructions}` : instructions,
|
|
|
783
1200
|
tasks.push(forwardTask);
|
|
784
1201
|
audioOut = _audioOut;
|
|
785
1202
|
}
|
|
786
|
-
audioOut.firstFrameFut.await.
|
|
1203
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
787
1204
|
}
|
|
788
1205
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
789
1206
|
if (audioOutput) {
|
|
@@ -798,28 +1215,63 @@ ${instructions}` : instructions,
|
|
|
798
1215
|
}
|
|
799
1216
|
}
|
|
800
1217
|
if (addToChatCtx) {
|
|
1218
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1219
|
+
const replyAssistantMetrics = {};
|
|
1220
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1221
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1222
|
+
}
|
|
1223
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1224
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1225
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1226
|
+
}
|
|
801
1227
|
const message = ChatMessage.create({
|
|
802
1228
|
role: "assistant",
|
|
803
1229
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
804
|
-
interrupted: speechHandle.interrupted
|
|
1230
|
+
interrupted: speechHandle.interrupted,
|
|
1231
|
+
metrics: replyAssistantMetrics
|
|
805
1232
|
});
|
|
806
1233
|
this.agent._chatCtx.insert(message);
|
|
807
1234
|
this.agentSession._conversationItemAdded(message);
|
|
808
1235
|
}
|
|
809
1236
|
if (this.agentSession.agentState === "speaking") {
|
|
810
1237
|
this.agentSession._updateAgentState("listening");
|
|
1238
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1239
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1240
|
+
}
|
|
1241
|
+
this.restoreInterruptionByAudioActivity();
|
|
811
1242
|
}
|
|
812
1243
|
}
|
|
813
|
-
async
|
|
814
|
-
|
|
1244
|
+
_pipelineReplyTaskImpl = async ({
|
|
1245
|
+
speechHandle,
|
|
1246
|
+
chatCtx,
|
|
1247
|
+
toolCtx,
|
|
1248
|
+
modelSettings,
|
|
1249
|
+
replyAbortController,
|
|
1250
|
+
instructions,
|
|
1251
|
+
newMessage,
|
|
1252
|
+
toolsMessages,
|
|
1253
|
+
span,
|
|
1254
|
+
_previousUserMetrics
|
|
1255
|
+
}) => {
|
|
1256
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1257
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1258
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1259
|
+
if (instructions) {
|
|
1260
|
+
span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
1261
|
+
}
|
|
1262
|
+
if (newMessage) {
|
|
1263
|
+
span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
|
|
1264
|
+
}
|
|
1265
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1266
|
+
if (localParticipant) {
|
|
1267
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
1268
|
+
}
|
|
815
1269
|
speechHandleStorage.enterWith(speechHandle);
|
|
816
1270
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
817
1271
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
818
1272
|
chatCtx = chatCtx.copy();
|
|
819
1273
|
if (newMessage) {
|
|
820
1274
|
chatCtx.insert(newMessage);
|
|
821
|
-
this.agent._chatCtx.insert(newMessage);
|
|
822
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
823
1275
|
}
|
|
824
1276
|
if (instructions) {
|
|
825
1277
|
try {
|
|
@@ -832,7 +1284,6 @@ ${instructions}` : instructions,
|
|
|
832
1284
|
this.logger.error({ error: e }, "error occurred during updateInstructions");
|
|
833
1285
|
}
|
|
834
1286
|
}
|
|
835
|
-
this.agentSession._updateAgentState("thinking");
|
|
836
1287
|
const tasks = [];
|
|
837
1288
|
const [llmTask, llmGenData] = performLLMInference(
|
|
838
1289
|
// preserve `this` context in llmNode
|
|
@@ -840,22 +1291,36 @@ ${instructions}` : instructions,
|
|
|
840
1291
|
chatCtx,
|
|
841
1292
|
toolCtx,
|
|
842
1293
|
modelSettings,
|
|
843
|
-
replyAbortController
|
|
1294
|
+
replyAbortController,
|
|
1295
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1296
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
844
1297
|
);
|
|
845
1298
|
tasks.push(llmTask);
|
|
846
|
-
const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
|
|
847
1299
|
let ttsTask = null;
|
|
848
|
-
let
|
|
1300
|
+
let ttsGenData = null;
|
|
1301
|
+
let llmOutput;
|
|
849
1302
|
if (audioOutput) {
|
|
850
|
-
[
|
|
1303
|
+
const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
|
|
1304
|
+
llmOutput = textOutput;
|
|
1305
|
+
[ttsTask, ttsGenData] = performTTSInference(
|
|
851
1306
|
(...args) => this.agent.ttsNode(...args),
|
|
852
1307
|
ttsTextInput,
|
|
853
1308
|
modelSettings,
|
|
854
|
-
replyAbortController
|
|
1309
|
+
replyAbortController,
|
|
1310
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1311
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
855
1312
|
);
|
|
856
1313
|
tasks.push(ttsTask);
|
|
1314
|
+
} else {
|
|
1315
|
+
llmOutput = llmGenData.textStream;
|
|
857
1316
|
}
|
|
858
1317
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1318
|
+
let userMetrics = _previousUserMetrics;
|
|
1319
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1320
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1321
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1322
|
+
userMetrics = newMessage.metrics;
|
|
1323
|
+
}
|
|
859
1324
|
if (speechHandle.interrupted) {
|
|
860
1325
|
replyAbortController.abort();
|
|
861
1326
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -865,7 +1330,20 @@ ${instructions}` : instructions,
|
|
|
865
1330
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
866
1331
|
speechHandle._clearAuthorization();
|
|
867
1332
|
const replyStartedAt = Date.now();
|
|
868
|
-
|
|
1333
|
+
let transcriptionInput = llmOutput;
|
|
1334
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1335
|
+
const timedTextsStream = await Promise.race([
|
|
1336
|
+
ttsGenData.timedTextsFut.await,
|
|
1337
|
+
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
1338
|
+
() => this.logger.warn("TTS task failed before resolving timedTextsFut")
|
|
1339
|
+
)) ?? Promise.resolve()
|
|
1340
|
+
]);
|
|
1341
|
+
if (timedTextsStream) {
|
|
1342
|
+
this.logger.debug("Using TTS aligned transcripts for transcription node input");
|
|
1343
|
+
transcriptionInput = timedTextsStream;
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
|
|
869
1347
|
let textOut = null;
|
|
870
1348
|
if (trNodeResult) {
|
|
871
1349
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -876,29 +1354,44 @@ ${instructions}` : instructions,
|
|
|
876
1354
|
tasks.push(textForwardTask);
|
|
877
1355
|
textOut = _textOut;
|
|
878
1356
|
}
|
|
879
|
-
|
|
880
|
-
|
|
1357
|
+
let agentStartedSpeakingAt;
|
|
1358
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1359
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1360
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1361
|
+
startTime: startedSpeakingAt,
|
|
1362
|
+
otelContext: speechHandle._agentTurnContext
|
|
1363
|
+
});
|
|
1364
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1365
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1366
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1367
|
+
}
|
|
881
1368
|
};
|
|
882
1369
|
let audioOut = null;
|
|
883
1370
|
if (audioOutput) {
|
|
884
|
-
if (
|
|
1371
|
+
if (ttsGenData) {
|
|
885
1372
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
886
|
-
|
|
1373
|
+
ttsGenData.audioStream,
|
|
887
1374
|
audioOutput,
|
|
888
1375
|
replyAbortController
|
|
889
1376
|
);
|
|
890
1377
|
audioOut = _audioOut;
|
|
891
1378
|
tasks.push(forwardTask);
|
|
892
|
-
audioOut.firstFrameFut.await.
|
|
1379
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
893
1380
|
} else {
|
|
894
|
-
throw Error("
|
|
1381
|
+
throw Error("ttsGenData is null when audioOutput is enabled");
|
|
895
1382
|
}
|
|
896
1383
|
} else {
|
|
897
|
-
textOut == null ? void 0 : textOut.firstTextFut.await.
|
|
1384
|
+
textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
898
1385
|
}
|
|
899
|
-
const onToolExecutionStarted = (
|
|
1386
|
+
const onToolExecutionStarted = (f) => {
|
|
1387
|
+
speechHandle._itemAdded([f]);
|
|
1388
|
+
this.agent._chatCtx.items.push(f);
|
|
1389
|
+
this.agentSession._toolItemsAdded([f]);
|
|
900
1390
|
};
|
|
901
|
-
const onToolExecutionCompleted = (
|
|
1391
|
+
const onToolExecutionCompleted = (out) => {
|
|
1392
|
+
if (out.toolCallOutput) {
|
|
1393
|
+
speechHandle._itemAdded([out.toolCallOutput]);
|
|
1394
|
+
}
|
|
902
1395
|
};
|
|
903
1396
|
const [executeToolsTask, toolOutput] = performToolExecutions({
|
|
904
1397
|
session: this.agentSession,
|
|
@@ -914,28 +1407,53 @@ ${instructions}` : instructions,
|
|
|
914
1407
|
if (audioOutput) {
|
|
915
1408
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
916
1409
|
}
|
|
1410
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1411
|
+
const assistantMetrics = {};
|
|
1412
|
+
if (llmGenData.ttft !== void 0) {
|
|
1413
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1414
|
+
}
|
|
1415
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1416
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1417
|
+
}
|
|
1418
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1419
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1420
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1421
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1422
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1423
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1424
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1428
|
+
let hasSpeechMessage = false;
|
|
917
1429
|
if (toolsMessages) {
|
|
918
1430
|
for (const msg of toolsMessages) {
|
|
919
1431
|
msg.createdAt = replyStartedAt;
|
|
920
1432
|
}
|
|
921
|
-
|
|
1433
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1434
|
+
(m) => m.type === "function_call_output"
|
|
1435
|
+
);
|
|
1436
|
+
if (toolCallOutputs.length > 0) {
|
|
1437
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1438
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1439
|
+
}
|
|
922
1440
|
}
|
|
923
1441
|
if (speechHandle.interrupted) {
|
|
924
1442
|
this.logger.debug(
|
|
925
1443
|
{ speech_id: speechHandle.id },
|
|
926
1444
|
"Aborting all pipeline reply tasks due to interruption"
|
|
927
1445
|
);
|
|
1446
|
+
if (audioOutput) {
|
|
1447
|
+
audioOutput.clearBuffer();
|
|
1448
|
+
}
|
|
928
1449
|
replyAbortController.abort();
|
|
929
|
-
await
|
|
930
|
-
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT))
|
|
931
|
-
);
|
|
1450
|
+
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
932
1451
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
933
1452
|
if (audioOutput) {
|
|
934
|
-
audioOutput.clearBuffer();
|
|
935
1453
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
936
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1454
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
937
1455
|
this.logger.info(
|
|
938
|
-
{ speech_id: speechHandle.id,
|
|
1456
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
939
1457
|
"playout interrupted"
|
|
940
1458
|
);
|
|
941
1459
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -946,19 +1464,27 @@ ${instructions}` : instructions,
|
|
|
946
1464
|
}
|
|
947
1465
|
}
|
|
948
1466
|
if (forwardedText) {
|
|
1467
|
+
hasSpeechMessage = true;
|
|
949
1468
|
const message = ChatMessage.create({
|
|
950
1469
|
role: "assistant",
|
|
951
1470
|
content: forwardedText,
|
|
952
1471
|
id: llmGenData.id,
|
|
953
1472
|
interrupted: true,
|
|
954
|
-
createdAt: replyStartedAt
|
|
1473
|
+
createdAt: replyStartedAt,
|
|
1474
|
+
metrics: assistantMetrics
|
|
955
1475
|
});
|
|
956
1476
|
chatCtx.insert(message);
|
|
957
1477
|
this.agent._chatCtx.insert(message);
|
|
1478
|
+
speechHandle._itemAdded([message]);
|
|
958
1479
|
this.agentSession._conversationItemAdded(message);
|
|
1480
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
959
1481
|
}
|
|
960
1482
|
if (this.agentSession.agentState === "speaking") {
|
|
961
1483
|
this.agentSession._updateAgentState("listening");
|
|
1484
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1485
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1486
|
+
this.restoreInterruptionByAudioActivity();
|
|
1487
|
+
}
|
|
962
1488
|
}
|
|
963
1489
|
this.logger.info(
|
|
964
1490
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -969,16 +1495,20 @@ ${instructions}` : instructions,
|
|
|
969
1495
|
return;
|
|
970
1496
|
}
|
|
971
1497
|
if (textOut && textOut.text) {
|
|
1498
|
+
hasSpeechMessage = true;
|
|
972
1499
|
const message = ChatMessage.create({
|
|
973
1500
|
role: "assistant",
|
|
974
1501
|
id: llmGenData.id,
|
|
975
1502
|
interrupted: false,
|
|
976
1503
|
createdAt: replyStartedAt,
|
|
977
|
-
content: textOut.text
|
|
1504
|
+
content: textOut.text,
|
|
1505
|
+
metrics: assistantMetrics
|
|
978
1506
|
});
|
|
979
1507
|
chatCtx.insert(message);
|
|
980
1508
|
this.agent._chatCtx.insert(message);
|
|
1509
|
+
speechHandle._itemAdded([message]);
|
|
981
1510
|
this.agentSession._conversationItemAdded(message);
|
|
1511
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
982
1512
|
this.logger.info(
|
|
983
1513
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
984
1514
|
"playout completed without interruption"
|
|
@@ -988,11 +1518,17 @@ ${instructions}` : instructions,
|
|
|
988
1518
|
this.agentSession._updateAgentState("thinking");
|
|
989
1519
|
} else if (this.agentSession.agentState === "speaking") {
|
|
990
1520
|
this.agentSession._updateAgentState("listening");
|
|
1521
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1522
|
+
{
|
|
1523
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1524
|
+
this.restoreInterruptionByAudioActivity();
|
|
1525
|
+
}
|
|
1526
|
+
}
|
|
991
1527
|
}
|
|
992
1528
|
speechHandle._markGenerationDone();
|
|
993
1529
|
await executeToolsTask.result;
|
|
994
1530
|
if (toolOutput.output.length === 0) return;
|
|
995
|
-
const { maxToolSteps } = this.agentSession.
|
|
1531
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
996
1532
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
997
1533
|
this.logger.warn(
|
|
998
1534
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1000,45 +1536,15 @@ ${instructions}` : instructions,
|
|
|
1000
1536
|
);
|
|
1001
1537
|
return;
|
|
1002
1538
|
}
|
|
1003
|
-
const functionToolsExecutedEvent =
|
|
1004
|
-
functionCalls: [],
|
|
1005
|
-
functionCallOutputs: []
|
|
1006
|
-
});
|
|
1007
|
-
let shouldGenerateToolReply = false;
|
|
1008
|
-
let newAgentTask = null;
|
|
1009
|
-
let ignoreTaskSwitch = false;
|
|
1010
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1011
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1012
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1013
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1014
|
-
if (sanitizedOut.replyRequired) {
|
|
1015
|
-
shouldGenerateToolReply = true;
|
|
1016
|
-
}
|
|
1017
|
-
}
|
|
1018
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1019
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1020
|
-
ignoreTaskSwitch = true;
|
|
1021
|
-
}
|
|
1022
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1023
|
-
this.logger.debug(
|
|
1024
|
-
{
|
|
1025
|
-
speechId: speechHandle.id,
|
|
1026
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1027
|
-
args: sanitizedOut.toolCall.args,
|
|
1028
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1029
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1030
|
-
},
|
|
1031
|
-
"Tool call execution finished"
|
|
1032
|
-
);
|
|
1033
|
-
}
|
|
1539
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1034
1540
|
this.agentSession.emit(
|
|
1035
1541
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1036
1542
|
functionToolsExecutedEvent
|
|
1037
1543
|
);
|
|
1038
|
-
let
|
|
1544
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1039
1545
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1040
1546
|
this.agentSession.updateAgent(newAgentTask);
|
|
1041
|
-
|
|
1547
|
+
schedulingPaused = true;
|
|
1042
1548
|
}
|
|
1043
1549
|
const toolMessages = [
|
|
1044
1550
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1046,47 +1552,85 @@ ${instructions}` : instructions,
|
|
|
1046
1552
|
];
|
|
1047
1553
|
if (shouldGenerateToolReply) {
|
|
1048
1554
|
chatCtx.insert(toolMessages);
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
stepIndex: speechHandle._stepIndex + 1,
|
|
1052
|
-
parent: speechHandle
|
|
1053
|
-
});
|
|
1054
|
-
this.agentSession.emit(
|
|
1055
|
-
AgentSessionEventTypes.SpeechCreated,
|
|
1056
|
-
createSpeechCreatedEvent({
|
|
1057
|
-
userInitiated: false,
|
|
1058
|
-
source: "tool_response",
|
|
1059
|
-
speechHandle: handle
|
|
1060
|
-
})
|
|
1061
|
-
);
|
|
1062
|
-
const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1555
|
+
speechHandle._numSteps += 1;
|
|
1556
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1063
1557
|
const toolResponseTask = this.createSpeechTask({
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
)
|
|
1558
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1559
|
+
speechHandle,
|
|
1560
|
+
chatCtx,
|
|
1561
|
+
toolCtx,
|
|
1562
|
+
{ toolChoice: respondToolChoice },
|
|
1563
|
+
replyAbortController,
|
|
1564
|
+
instructions,
|
|
1565
|
+
void 0,
|
|
1566
|
+
toolMessages,
|
|
1567
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1075
1568
|
),
|
|
1076
|
-
ownedSpeechHandle:
|
|
1569
|
+
ownedSpeechHandle: speechHandle,
|
|
1077
1570
|
name: "AgentActivity.pipelineReply"
|
|
1078
1571
|
});
|
|
1079
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1080
|
-
this.scheduleSpeech(
|
|
1572
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1573
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1081
1574
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1082
1575
|
for (const msg of toolMessages) {
|
|
1083
1576
|
msg.createdAt = replyStartedAt;
|
|
1084
1577
|
}
|
|
1085
|
-
|
|
1578
|
+
const toolCallOutputs = toolMessages.filter(
|
|
1579
|
+
(m) => m.type === "function_call_output"
|
|
1580
|
+
);
|
|
1581
|
+
if (toolCallOutputs.length > 0) {
|
|
1582
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1583
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1584
|
+
}
|
|
1086
1585
|
}
|
|
1087
|
-
}
|
|
1586
|
+
};
|
|
1587
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
|
|
1588
|
+
async (span) => this._pipelineReplyTaskImpl({
|
|
1589
|
+
speechHandle,
|
|
1590
|
+
chatCtx,
|
|
1591
|
+
toolCtx,
|
|
1592
|
+
modelSettings,
|
|
1593
|
+
replyAbortController,
|
|
1594
|
+
instructions,
|
|
1595
|
+
newMessage,
|
|
1596
|
+
toolsMessages,
|
|
1597
|
+
span,
|
|
1598
|
+
_previousUserMetrics
|
|
1599
|
+
}),
|
|
1600
|
+
{
|
|
1601
|
+
name: "agent_turn",
|
|
1602
|
+
context: this.agentSession.rootSpanContext
|
|
1603
|
+
}
|
|
1604
|
+
);
|
|
1088
1605
|
async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
|
|
1089
|
-
|
|
1606
|
+
return tracer.startActiveSpan(
|
|
1607
|
+
async (span) => this._realtimeGenerationTaskImpl({
|
|
1608
|
+
speechHandle,
|
|
1609
|
+
ev,
|
|
1610
|
+
modelSettings,
|
|
1611
|
+
replyAbortController,
|
|
1612
|
+
span
|
|
1613
|
+
}),
|
|
1614
|
+
{
|
|
1615
|
+
name: "agent_turn",
|
|
1616
|
+
context: this.agentSession.rootSpanContext
|
|
1617
|
+
}
|
|
1618
|
+
);
|
|
1619
|
+
}
|
|
1620
|
+
async _realtimeGenerationTaskImpl({
|
|
1621
|
+
speechHandle,
|
|
1622
|
+
ev,
|
|
1623
|
+
modelSettings,
|
|
1624
|
+
replyAbortController,
|
|
1625
|
+
span
|
|
1626
|
+
}) {
|
|
1627
|
+
var _a;
|
|
1628
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1629
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1630
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1631
|
+
if (localParticipant) {
|
|
1632
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
1633
|
+
}
|
|
1090
1634
|
speechHandleStorage.enterWith(speechHandle);
|
|
1091
1635
|
if (!this.realtimeSession) {
|
|
1092
1636
|
throw new Error("realtime session is not initialized");
|
|
@@ -1094,6 +1638,10 @@ ${instructions}` : instructions,
|
|
|
1094
1638
|
if (!(this.llm instanceof RealtimeModel)) {
|
|
1095
1639
|
throw new Error("llm is not a realtime model");
|
|
1096
1640
|
}
|
|
1641
|
+
span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
|
|
1642
|
+
if (this.realtimeSpans && ev.responseId) {
|
|
1643
|
+
this.realtimeSpans.set(ev.responseId, span);
|
|
1644
|
+
}
|
|
1097
1645
|
this.logger.debug(
|
|
1098
1646
|
{ speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
|
|
1099
1647
|
"realtime generation started"
|
|
@@ -1106,10 +1654,17 @@ ${instructions}` : instructions,
|
|
|
1106
1654
|
if (speechHandle.interrupted) {
|
|
1107
1655
|
return;
|
|
1108
1656
|
}
|
|
1109
|
-
const onFirstFrame = () => {
|
|
1110
|
-
this.agentSession._updateAgentState("speaking"
|
|
1657
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1658
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1659
|
+
startTime: startedSpeakingAt,
|
|
1660
|
+
otelContext: speechHandle._agentTurnContext
|
|
1661
|
+
});
|
|
1111
1662
|
};
|
|
1112
1663
|
const readMessages = async (abortController, outputs) => {
|
|
1664
|
+
var _a2, _b;
|
|
1665
|
+
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1666
|
+
once: true
|
|
1667
|
+
});
|
|
1113
1668
|
const forwardTasks = [];
|
|
1114
1669
|
try {
|
|
1115
1670
|
for await (const msg of ev.messageStream) {
|
|
@@ -1119,7 +1674,22 @@ ${instructions}` : instructions,
|
|
|
1119
1674
|
);
|
|
1120
1675
|
break;
|
|
1121
1676
|
}
|
|
1122
|
-
const
|
|
1677
|
+
const msgModalities = msg.modalities ? await msg.modalities : void 0;
|
|
1678
|
+
let ttsTextInput = null;
|
|
1679
|
+
let trTextInput;
|
|
1680
|
+
if (msgModalities && !msgModalities.includes("audio") && this.tts) {
|
|
1681
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1682
|
+
this.logger.warn(
|
|
1683
|
+
"text response received from realtime API, falling back to use a TTS model."
|
|
1684
|
+
);
|
|
1685
|
+
}
|
|
1686
|
+
const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
|
|
1687
|
+
ttsTextInput = _ttsTextInput;
|
|
1688
|
+
trTextInput = _trTextInput;
|
|
1689
|
+
} else {
|
|
1690
|
+
trTextInput = msg.textStream;
|
|
1691
|
+
}
|
|
1692
|
+
const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
|
|
1123
1693
|
let textOut = null;
|
|
1124
1694
|
if (trNodeResult) {
|
|
1125
1695
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -1132,28 +1702,46 @@ ${instructions}` : instructions,
|
|
|
1132
1702
|
}
|
|
1133
1703
|
let audioOut = null;
|
|
1134
1704
|
if (audioOutput) {
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1705
|
+
let realtimeAudioResult = null;
|
|
1706
|
+
if (ttsTextInput) {
|
|
1707
|
+
const [ttsTask, ttsGenData] = performTTSInference(
|
|
1708
|
+
(...args) => this.agent.ttsNode(...args),
|
|
1709
|
+
ttsTextInput,
|
|
1710
|
+
modelSettings,
|
|
1711
|
+
abortController,
|
|
1712
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1713
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1714
|
+
);
|
|
1715
|
+
tasks.push(ttsTask);
|
|
1716
|
+
realtimeAudioResult = ttsGenData.audioStream;
|
|
1717
|
+
} else if (msgModalities && msgModalities.includes("audio")) {
|
|
1718
|
+
realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
|
|
1719
|
+
msg.audioStream,
|
|
1720
|
+
modelSettings
|
|
1721
|
+
);
|
|
1722
|
+
} else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1723
|
+
this.logger.error(
|
|
1724
|
+
"Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
|
|
1725
|
+
);
|
|
1726
|
+
} else {
|
|
1727
|
+
this.logger.warn(
|
|
1728
|
+
"audio output is enabled but neither tts nor realtime audio is available"
|
|
1729
|
+
);
|
|
1730
|
+
}
|
|
1731
|
+
if (realtimeAudioResult) {
|
|
1140
1732
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1141
|
-
|
|
1733
|
+
realtimeAudioResult,
|
|
1142
1734
|
audioOutput,
|
|
1143
1735
|
abortController
|
|
1144
1736
|
);
|
|
1145
1737
|
forwardTasks.push(forwardTask);
|
|
1146
1738
|
audioOut = _audioOut;
|
|
1147
|
-
audioOut.firstFrameFut.await.
|
|
1148
|
-
} else {
|
|
1149
|
-
this.logger.warn(
|
|
1150
|
-
"audio output is enabled but neither tts nor realtime audio is available"
|
|
1151
|
-
);
|
|
1739
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1152
1740
|
}
|
|
1153
1741
|
} else if (textOut) {
|
|
1154
|
-
textOut.firstTextFut.await.
|
|
1742
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1155
1743
|
}
|
|
1156
|
-
outputs.push([msg.messageId, textOut, audioOut]);
|
|
1744
|
+
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1157
1745
|
}
|
|
1158
1746
|
await waitFor(forwardTasks);
|
|
1159
1747
|
} catch (error) {
|
|
@@ -1166,7 +1754,7 @@ ${instructions}` : instructions,
|
|
|
1166
1754
|
const tasks = [
|
|
1167
1755
|
Task.from(
|
|
1168
1756
|
(controller) => readMessages(controller, messageOutputs),
|
|
1169
|
-
|
|
1757
|
+
void 0,
|
|
1170
1758
|
"AgentActivity.realtime_generation.read_messages"
|
|
1171
1759
|
)
|
|
1172
1760
|
];
|
|
@@ -1194,6 +1782,8 @@ ${instructions}` : instructions,
|
|
|
1194
1782
|
);
|
|
1195
1783
|
const onToolExecutionStarted = (f) => {
|
|
1196
1784
|
speechHandle._itemAdded([f]);
|
|
1785
|
+
this.agent._chatCtx.items.push(f);
|
|
1786
|
+
this.agentSession._toolItemsAdded([f]);
|
|
1197
1787
|
};
|
|
1198
1788
|
const onToolExecutionCompleted = (out) => {
|
|
1199
1789
|
if (out.toolCallOutput) {
|
|
@@ -1213,7 +1803,6 @@ ${instructions}` : instructions,
|
|
|
1213
1803
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1214
1804
|
if (audioOutput) {
|
|
1215
1805
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1216
|
-
this.agentSession._updateAgentState("listening");
|
|
1217
1806
|
}
|
|
1218
1807
|
if (speechHandle.interrupted) {
|
|
1219
1808
|
this.logger.debug(
|
|
@@ -1223,15 +1812,15 @@ ${instructions}` : instructions,
|
|
|
1223
1812
|
replyAbortController.abort();
|
|
1224
1813
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1225
1814
|
if (messageOutputs.length > 0) {
|
|
1226
|
-
const [msgId, textOut, audioOut] = messageOutputs[0];
|
|
1815
|
+
const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
|
|
1227
1816
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
1228
1817
|
if (audioOutput) {
|
|
1229
1818
|
audioOutput.clearBuffer();
|
|
1230
1819
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1231
|
-
let
|
|
1232
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1820
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
1821
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1233
1822
|
this.logger.info(
|
|
1234
|
-
{ speech_id: speechHandle.id,
|
|
1823
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1235
1824
|
"playout interrupted"
|
|
1236
1825
|
);
|
|
1237
1826
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1239,11 +1828,13 @@ ${instructions}` : instructions,
|
|
|
1239
1828
|
}
|
|
1240
1829
|
} else {
|
|
1241
1830
|
forwardedText = "";
|
|
1242
|
-
|
|
1831
|
+
playbackPositionInS = 0;
|
|
1243
1832
|
}
|
|
1244
1833
|
this.realtimeSession.truncate({
|
|
1245
1834
|
messageId: msgId,
|
|
1246
|
-
audioEndMs: Math.floor(
|
|
1835
|
+
audioEndMs: Math.floor(playbackPositionInS * 1e3),
|
|
1836
|
+
modalities: msgModalities,
|
|
1837
|
+
audioTranscript: forwardedText
|
|
1247
1838
|
});
|
|
1248
1839
|
}
|
|
1249
1840
|
if (forwardedText) {
|
|
@@ -1267,7 +1858,7 @@ ${instructions}` : instructions,
|
|
|
1267
1858
|
return;
|
|
1268
1859
|
}
|
|
1269
1860
|
if (messageOutputs.length > 0) {
|
|
1270
|
-
const [msgId, textOut, _] = messageOutputs[0];
|
|
1861
|
+
const [msgId, textOut, _, __] = messageOutputs[0];
|
|
1271
1862
|
const message = ChatMessage.create({
|
|
1272
1863
|
role: "assistant",
|
|
1273
1864
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
@@ -1279,12 +1870,16 @@ ${instructions}` : instructions,
|
|
|
1279
1870
|
this.agentSession._conversationItemAdded(message);
|
|
1280
1871
|
}
|
|
1281
1872
|
speechHandle._markGenerationDone();
|
|
1282
|
-
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
1283
|
-
this.agentSession._updateAgentState("thinking");
|
|
1284
|
-
});
|
|
1285
1873
|
await executeToolsTask.result;
|
|
1286
|
-
if (toolOutput.output.length
|
|
1287
|
-
|
|
1874
|
+
if (toolOutput.output.length > 0) {
|
|
1875
|
+
this.agentSession._updateAgentState("thinking");
|
|
1876
|
+
} else if (this.agentSession.agentState === "speaking") {
|
|
1877
|
+
this.agentSession._updateAgentState("listening");
|
|
1878
|
+
}
|
|
1879
|
+
if (toolOutput.output.length === 0) {
|
|
1880
|
+
return;
|
|
1881
|
+
}
|
|
1882
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
1288
1883
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1289
1884
|
this.logger.warn(
|
|
1290
1885
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1292,48 +1887,29 @@ ${instructions}` : instructions,
|
|
|
1292
1887
|
);
|
|
1293
1888
|
return;
|
|
1294
1889
|
}
|
|
1295
|
-
const functionToolsExecutedEvent =
|
|
1296
|
-
functionCalls: [],
|
|
1297
|
-
functionCallOutputs: []
|
|
1298
|
-
});
|
|
1299
|
-
let shouldGenerateToolReply = false;
|
|
1300
|
-
let newAgentTask = null;
|
|
1301
|
-
let ignoreTaskSwitch = false;
|
|
1302
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1303
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1304
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1305
|
-
if (sanitizedOut.replyRequired) {
|
|
1306
|
-
shouldGenerateToolReply = true;
|
|
1307
|
-
}
|
|
1308
|
-
}
|
|
1309
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1310
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1311
|
-
ignoreTaskSwitch = true;
|
|
1312
|
-
}
|
|
1313
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1314
|
-
this.logger.debug(
|
|
1315
|
-
{
|
|
1316
|
-
speechId: speechHandle.id,
|
|
1317
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1318
|
-
args: sanitizedOut.toolCall.args,
|
|
1319
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1320
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1321
|
-
},
|
|
1322
|
-
"Tool call execution finished"
|
|
1323
|
-
);
|
|
1324
|
-
}
|
|
1890
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1325
1891
|
this.agentSession.emit(
|
|
1326
1892
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1327
1893
|
functionToolsExecutedEvent
|
|
1328
1894
|
);
|
|
1329
|
-
let
|
|
1895
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1330
1896
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1331
1897
|
this.agentSession.updateAgent(newAgentTask);
|
|
1332
|
-
|
|
1898
|
+
schedulingPaused = true;
|
|
1333
1899
|
}
|
|
1334
1900
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1901
|
+
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
1902
|
+
if (this.currentSpeech && !this.currentSpeech.done() && this.currentSpeech !== speechHandle) {
|
|
1903
|
+
await this.currentSpeech.waitForPlayout();
|
|
1904
|
+
} else {
|
|
1905
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
1906
|
+
}
|
|
1907
|
+
}
|
|
1335
1908
|
const chatCtx = this.realtimeSession.chatCtx.copy();
|
|
1336
1909
|
chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
|
|
1910
|
+
this.agentSession._toolItemsAdded(
|
|
1911
|
+
functionToolsExecutedEvent.functionCallOutputs
|
|
1912
|
+
);
|
|
1337
1913
|
try {
|
|
1338
1914
|
await this.realtimeSession.updateChatCtx(chatCtx);
|
|
1339
1915
|
} catch (error) {
|
|
@@ -1360,20 +1936,58 @@ ${instructions}` : instructions,
|
|
|
1360
1936
|
speechHandle: replySpeechHandle
|
|
1361
1937
|
})
|
|
1362
1938
|
);
|
|
1363
|
-
const toolChoice =
|
|
1939
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1364
1940
|
this.createSpeechTask({
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
})
|
|
1371
|
-
),
|
|
1941
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1942
|
+
speechHandle: replySpeechHandle,
|
|
1943
|
+
modelSettings: { toolChoice },
|
|
1944
|
+
abortController
|
|
1945
|
+
}),
|
|
1372
1946
|
ownedSpeechHandle: replySpeechHandle,
|
|
1373
1947
|
name: "AgentActivity.realtime_reply"
|
|
1374
1948
|
});
|
|
1375
1949
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1376
1950
|
}
|
|
1951
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1952
|
+
var _a, _b, _c;
|
|
1953
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
1954
|
+
functionCalls: [],
|
|
1955
|
+
functionCallOutputs: []
|
|
1956
|
+
});
|
|
1957
|
+
let shouldGenerateToolReply = false;
|
|
1958
|
+
let newAgentTask = null;
|
|
1959
|
+
let ignoreTaskSwitch = false;
|
|
1960
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1961
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1962
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1963
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1964
|
+
if (sanitizedOut.replyRequired) {
|
|
1965
|
+
shouldGenerateToolReply = true;
|
|
1966
|
+
}
|
|
1967
|
+
}
|
|
1968
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1969
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1970
|
+
ignoreTaskSwitch = true;
|
|
1971
|
+
}
|
|
1972
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1973
|
+
this.logger.debug(
|
|
1974
|
+
{
|
|
1975
|
+
speechId: speechHandle.id,
|
|
1976
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1977
|
+
args: sanitizedOut.toolCall.args,
|
|
1978
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1979
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1980
|
+
},
|
|
1981
|
+
"Tool call execution finished"
|
|
1982
|
+
);
|
|
1983
|
+
}
|
|
1984
|
+
return {
|
|
1985
|
+
functionToolsExecutedEvent,
|
|
1986
|
+
shouldGenerateToolReply,
|
|
1987
|
+
newAgentTask,
|
|
1988
|
+
ignoreTaskSwitch
|
|
1989
|
+
};
|
|
1990
|
+
}
|
|
1377
1991
|
async realtimeReplyTask({
|
|
1378
1992
|
speechHandle,
|
|
1379
1993
|
modelSettings: { toolChoice },
|
|
@@ -1415,71 +2029,193 @@ ${instructions}` : instructions,
|
|
|
1415
2029
|
}
|
|
1416
2030
|
}
|
|
1417
2031
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1418
|
-
if (this.
|
|
1419
|
-
throw new Error("cannot schedule new speech, the
|
|
2032
|
+
if (this.schedulingPaused && !force) {
|
|
2033
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1420
2034
|
}
|
|
1421
2035
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1422
2036
|
speechHandle._markScheduled();
|
|
1423
2037
|
this.wakeupMainTask();
|
|
1424
2038
|
}
|
|
2039
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
2040
|
+
if (this._schedulingPaused) return;
|
|
2041
|
+
this._schedulingPaused = true;
|
|
2042
|
+
this._drainBlockedTasks = blockedTasks;
|
|
2043
|
+
this.wakeupMainTask();
|
|
2044
|
+
if (this._mainTask) {
|
|
2045
|
+
await this._mainTask.result;
|
|
2046
|
+
}
|
|
2047
|
+
}
|
|
2048
|
+
_resumeSchedulingTask() {
|
|
2049
|
+
if (!this._schedulingPaused) return;
|
|
2050
|
+
this._schedulingPaused = false;
|
|
2051
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
2052
|
+
}
|
|
2053
|
+
async pause(options = {}) {
|
|
2054
|
+
const { blockedTasks = [] } = options;
|
|
2055
|
+
const unlock = await this.lock.lock();
|
|
2056
|
+
try {
|
|
2057
|
+
const span = tracer.startSpan({
|
|
2058
|
+
name: "pause_agent_activity",
|
|
2059
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
2060
|
+
});
|
|
2061
|
+
try {
|
|
2062
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
2063
|
+
await this._closeSessionResources();
|
|
2064
|
+
} finally {
|
|
2065
|
+
span.end();
|
|
2066
|
+
}
|
|
2067
|
+
} finally {
|
|
2068
|
+
unlock();
|
|
2069
|
+
}
|
|
2070
|
+
}
|
|
1425
2071
|
async drain() {
|
|
1426
|
-
|
|
2072
|
+
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
2073
|
+
name: "drain_agent_activity",
|
|
2074
|
+
context: ROOT_CONTEXT
|
|
2075
|
+
});
|
|
2076
|
+
}
|
|
2077
|
+
async _drainImpl(span) {
|
|
2078
|
+
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1427
2079
|
const unlock = await this.lock.lock();
|
|
1428
2080
|
try {
|
|
1429
|
-
if (this.
|
|
1430
|
-
this.createSpeechTask({
|
|
1431
|
-
|
|
2081
|
+
if (this._schedulingPaused) return;
|
|
2082
|
+
this._onExitTask = this.createSpeechTask({
|
|
2083
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
2084
|
+
name: "on_exit",
|
|
2085
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
2086
|
+
}),
|
|
2087
|
+
inlineTask: true,
|
|
1432
2088
|
name: "AgentActivity_onExit"
|
|
1433
2089
|
});
|
|
1434
|
-
this.
|
|
1435
|
-
this.
|
|
1436
|
-
await
|
|
2090
|
+
this.cancelPreemptiveGeneration();
|
|
2091
|
+
await this._onExitTask.result;
|
|
2092
|
+
await this._pauseSchedulingTask([]);
|
|
1437
2093
|
} finally {
|
|
1438
2094
|
unlock();
|
|
1439
2095
|
}
|
|
1440
2096
|
}
|
|
1441
2097
|
async close() {
|
|
1442
|
-
var _a, _b, _c;
|
|
1443
2098
|
const unlock = await this.lock.lock();
|
|
1444
2099
|
try {
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1450
|
-
}
|
|
1451
|
-
if (this.realtimeSession) {
|
|
1452
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1453
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1454
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1455
|
-
this.realtimeSession.off(
|
|
1456
|
-
"input_audio_transcription_completed",
|
|
1457
|
-
this.onInputAudioTranscriptionCompleted
|
|
1458
|
-
);
|
|
1459
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
2100
|
+
this.cancelPreemptiveGeneration();
|
|
2101
|
+
await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
2102
|
+
if (this._currentSpeech && !this._currentSpeech.done()) {
|
|
2103
|
+
this._currentSpeech._markDone();
|
|
1460
2104
|
}
|
|
1461
|
-
|
|
1462
|
-
|
|
2105
|
+
await this._closeSessionResources();
|
|
2106
|
+
if (this._mainTask) {
|
|
2107
|
+
await this._mainTask.cancelAndWait();
|
|
1463
2108
|
}
|
|
1464
|
-
if (this.
|
|
1465
|
-
this.
|
|
2109
|
+
if (this.interruptionDetector) {
|
|
2110
|
+
this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2111
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2112
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
1466
2113
|
}
|
|
1467
|
-
|
|
1468
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1469
|
-
}
|
|
1470
|
-
this.detachAudioInput();
|
|
1471
|
-
await ((_a = this.realtimeSession) == null ? void 0 : _a.close());
|
|
1472
|
-
await ((_b = this.audioRecognition) == null ? void 0 : _b.close());
|
|
1473
|
-
await ((_c = this._mainTask) == null ? void 0 : _c.cancelAndWait());
|
|
2114
|
+
this.agent._agentActivity = void 0;
|
|
1474
2115
|
} finally {
|
|
1475
2116
|
unlock();
|
|
1476
2117
|
}
|
|
1477
2118
|
}
|
|
2119
|
+
resolveInterruptionDetector() {
|
|
2120
|
+
var _a, _b;
|
|
2121
|
+
const agentInterruptionDetection = (_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.mode;
|
|
2122
|
+
const sessionInterruptionDetection = this.agentSession.interruptionDetection;
|
|
2123
|
+
if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
|
|
2124
|
+
if (agentInterruptionDetection === "adaptive" || sessionInterruptionDetection === "adaptive") {
|
|
2125
|
+
this.logger.warn(
|
|
2126
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
|
|
2127
|
+
);
|
|
2128
|
+
}
|
|
2129
|
+
return void 0;
|
|
2130
|
+
}
|
|
2131
|
+
if (!this.allowInterruptions) {
|
|
2132
|
+
return void 0;
|
|
2133
|
+
}
|
|
2134
|
+
if (agentInterruptionDetection === "vad") {
|
|
2135
|
+
return void 0;
|
|
2136
|
+
}
|
|
2137
|
+
if (sessionInterruptionDetection === "vad") {
|
|
2138
|
+
return void 0;
|
|
2139
|
+
}
|
|
2140
|
+
if (agentInterruptionDetection === void 0 && sessionInterruptionDetection === void 0 && !isHosted() && !isDevMode()) {
|
|
2141
|
+
this.logger.info("adaptive interruption is disabled by default in production mode");
|
|
2142
|
+
return void 0;
|
|
2143
|
+
}
|
|
2144
|
+
try {
|
|
2145
|
+
const detector = new AdaptiveInterruptionDetector();
|
|
2146
|
+
detector.on("overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2147
|
+
detector.on("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2148
|
+
detector.on("error", this.onInterruptionError);
|
|
2149
|
+
return detector;
|
|
2150
|
+
} catch (error) {
|
|
2151
|
+
this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
|
|
2152
|
+
}
|
|
2153
|
+
return void 0;
|
|
2154
|
+
}
|
|
2155
|
+
restoreInterruptionByAudioActivity() {
|
|
2156
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2157
|
+
}
|
|
2158
|
+
fallbackToVadInterruption() {
|
|
2159
|
+
if (!this.isInterruptionDetectionEnabled) return;
|
|
2160
|
+
this.isInterruptionDetectionEnabled = false;
|
|
2161
|
+
this.restoreInterruptionByAudioActivity();
|
|
2162
|
+
if (this.interruptionDetector) {
|
|
2163
|
+
this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2164
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2165
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
2166
|
+
this.interruptionDetector = void 0;
|
|
2167
|
+
}
|
|
2168
|
+
if (this.audioRecognition) {
|
|
2169
|
+
this.audioRecognition.disableInterruptionDetection().catch((err) => {
|
|
2170
|
+
this.logger.warn({ err }, "error while disabling interruption detection");
|
|
2171
|
+
});
|
|
2172
|
+
}
|
|
2173
|
+
this.logger.warn(
|
|
2174
|
+
"adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption"
|
|
2175
|
+
);
|
|
2176
|
+
}
|
|
2177
|
+
async _closeSessionResources() {
|
|
2178
|
+
var _a, _b, _c;
|
|
2179
|
+
if (this.llm instanceof LLM) {
|
|
2180
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
2181
|
+
this.llm.off("error", this.onModelError);
|
|
2182
|
+
}
|
|
2183
|
+
if (this.realtimeSession) {
|
|
2184
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
2185
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
2186
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
2187
|
+
this.realtimeSession.off(
|
|
2188
|
+
"input_audio_transcription_completed",
|
|
2189
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
2190
|
+
);
|
|
2191
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
2192
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
2193
|
+
}
|
|
2194
|
+
if (this.stt instanceof STT) {
|
|
2195
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
2196
|
+
this.stt.off("error", this.onModelError);
|
|
2197
|
+
}
|
|
2198
|
+
if (this.tts instanceof TTS) {
|
|
2199
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
2200
|
+
this.tts.off("error", this.onModelError);
|
|
2201
|
+
}
|
|
2202
|
+
if (this.vad instanceof VAD) {
|
|
2203
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
2204
|
+
}
|
|
2205
|
+
this.detachAudioInput();
|
|
2206
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
2207
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
2208
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
2209
|
+
this.realtimeSession = void 0;
|
|
2210
|
+
this.audioRecognition = void 0;
|
|
2211
|
+
}
|
|
1478
2212
|
}
|
|
1479
2213
|
function toOaiToolChoice(toolChoice) {
|
|
1480
2214
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1481
2215
|
}
|
|
1482
2216
|
export {
|
|
1483
|
-
AgentActivity
|
|
2217
|
+
AgentActivity,
|
|
2218
|
+
agentActivityStorage,
|
|
2219
|
+
onEnterStorage
|
|
1484
2220
|
};
|
|
1485
2221
|
//# sourceMappingURL=agent_activity.js.map
|