@livekit/agents 1.1.0 → 1.2.0
This diff shows the changes between publicly released versions of this package as published to one of the supported registries. The information is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- package/dist/audio.cjs +89 -3
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +36 -1
- package/dist/audio.d.ts +36 -1
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +76 -2
- package/dist/audio.js.map +1 -1
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +44 -46
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +45 -47
- package/dist/cli.js.map +1 -1
- package/dist/connection_pool.cjs +242 -0
- package/dist/connection_pool.cjs.map +1 -0
- package/dist/connection_pool.d.cts +123 -0
- package/dist/connection_pool.d.ts +123 -0
- package/dist/connection_pool.d.ts.map +1 -0
- package/dist/connection_pool.js +218 -0
- package/dist/connection_pool.js.map +1 -0
- package/dist/connection_pool.test.cjs +256 -0
- package/dist/connection_pool.test.cjs.map +1 -0
- package/dist/connection_pool.test.js +255 -0
- package/dist/connection_pool.test.js.map +1 -0
- package/dist/constants.cjs +30 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +10 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +20 -0
- package/dist/constants.js.map +1 -1
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/http_server.cjs +9 -6
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +5 -1
- package/dist/http_server.d.ts +5 -1
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js +9 -6
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +24 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -11
- package/dist/index.d.ts +15 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -9
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.cjs +70 -2
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +373 -32
- package/dist/inference/api_protos.d.ts +373 -32
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +62 -2
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +3 -4
- package/dist/inference/index.d.ts +3 -4
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +18 -3
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +163 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +137 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +198 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +164 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +342 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +308 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +106 -66
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +65 -43
- package/dist/inference/llm.d.ts +65 -43
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +100 -66
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +319 -170
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +64 -15
- package/dist/inference/stt.d.ts +64 -15
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +319 -170
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +218 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +217 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +249 -71
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +55 -16
- package/dist/inference/tts.d.ts +55 -16
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +249 -77
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +233 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +232 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +14 -1
- package/dist/inference/utils.d.ts +14 -1
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs +6 -3
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
- package/dist/ipc/inference_proc_executor.js +6 -3
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +6 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.ts.map +1 -1
- package/dist/ipc/job_proc_executor.js +6 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +89 -17
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +68 -18
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs +34 -8
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +8 -0
- package/dist/ipc/supervised_proc.d.ts +8 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js +34 -8
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/ipc/supervised_proc.test.cjs +145 -0
- package/dist/ipc/supervised_proc.test.cjs.map +1 -0
- package/dist/ipc/supervised_proc.test.js +122 -0
- package/dist/ipc/supervised_proc.test.js.map +1 -0
- package/dist/job.cjs +109 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +14 -0
- package/dist/job.d.ts +14 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +99 -1
- package/dist/job.js.map +1 -1
- package/dist/language.cjs +394 -0
- package/dist/language.cjs.map +1 -0
- package/dist/language.d.cts +15 -0
- package/dist/language.d.ts +15 -0
- package/dist/language.d.ts.map +1 -0
- package/dist/language.js +363 -0
- package/dist/language.js.map +1 -0
- package/dist/language.test.cjs +43 -0
- package/dist/language.test.cjs.map +1 -0
- package/dist/language.test.js +49 -0
- package/dist/language.test.js.map +1 -0
- package/dist/llm/chat_context.cjs +274 -3
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +86 -2
- package/dist/llm/chat_context.d.ts +86 -2
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +273 -3
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +574 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +574 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/fallback_adapter.cjs +278 -0
- package/dist/llm/fallback_adapter.cjs.map +1 -0
- package/dist/llm/fallback_adapter.d.cts +73 -0
- package/dist/llm/fallback_adapter.d.ts +73 -0
- package/dist/llm/fallback_adapter.d.ts.map +1 -0
- package/dist/llm/fallback_adapter.js +254 -0
- package/dist/llm/fallback_adapter.js.map +1 -0
- package/dist/llm/fallback_adapter.test.cjs +176 -0
- package/dist/llm/fallback_adapter.test.cjs.map +1 -0
- package/dist/llm/fallback_adapter.test.js +175 -0
- package/dist/llm/fallback_adapter.test.js.map +1 -0
- package/dist/llm/index.cjs +9 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +4 -3
- package/dist/llm/index.d.ts +4 -3
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +11 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +65 -11
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +13 -2
- package/dist/llm/llm.d.ts +13 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +65 -11
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs +6 -2
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js +6 -2
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/google.test.cjs +48 -0
- package/dist/llm/provider_format/google.test.cjs.map +1 -1
- package/dist/llm/provider_format/google.test.js +54 -1
- package/dist/llm/provider_format/google.test.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +126 -24
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +124 -23
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +393 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +400 -2
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +5 -4
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +5 -4
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +15 -1
- package/dist/llm/realtime.d.ts +15 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/remote_chat_context.cjs.map +1 -1
- package/dist/llm/remote_chat_context.d.cts +2 -0
- package/dist/llm/remote_chat_context.d.ts +2 -0
- package/dist/llm/remote_chat_context.d.ts.map +1 -1
- package/dist/llm/remote_chat_context.js.map +1 -1
- package/dist/llm/tool_context.cjs +50 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +47 -11
- package/dist/llm/tool_context.d.ts +47 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +48 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +18 -12
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +2 -3
- package/dist/llm/utils.d.ts +2 -3
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +18 -12
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +102 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +64 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +472 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +455 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/log.cjs +45 -14
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +8 -1
- package/dist/log.d.ts +8 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +45 -15
- package/dist/log.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +75 -19
- package/dist/metrics/base.d.ts +75 -19
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +5 -2
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +10 -1
- package/dist/metrics/usage_collector.d.ts +10 -1
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +5 -2
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +23 -7
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +23 -7
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +31 -10
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +31 -10
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +344 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +343 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stream/stream_channel.cjs +39 -1
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +5 -2
- package/dist/stream/stream_channel.d.ts +5 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +39 -1
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stream/stream_channel.test.cjs +27 -0
- package/dist/stream/stream_channel.test.cjs.map +1 -1
- package/dist/stream/stream_channel.test.js +27 -0
- package/dist/stream/stream_channel.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +24 -9
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +24 -9
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +86 -19
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +60 -5
- package/dist/stt/stt.d.ts +60 -5
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +88 -21
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/index.cjs +72 -0
- package/dist/telemetry/index.cjs.map +1 -0
- package/dist/telemetry/index.d.cts +7 -0
- package/dist/telemetry/index.d.ts +7 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +37 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/logging.cjs +65 -0
- package/dist/telemetry/logging.cjs.map +1 -0
- package/dist/telemetry/logging.d.cts +21 -0
- package/dist/telemetry/logging.d.ts +21 -0
- package/dist/telemetry/logging.d.ts.map +1 -0
- package/dist/telemetry/logging.js +40 -0
- package/dist/telemetry/logging.js.map +1 -0
- package/dist/telemetry/otel_http_exporter.cjs +166 -0
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
- package/dist/telemetry/otel_http_exporter.d.cts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
- package/dist/telemetry/otel_http_exporter.js +142 -0
- package/dist/telemetry/otel_http_exporter.js.map +1 -0
- package/dist/telemetry/pino_otel_transport.cjs +217 -0
- package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
- package/dist/telemetry/pino_otel_transport.d.cts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
- package/dist/telemetry/pino_otel_transport.js +189 -0
- package/dist/telemetry/pino_otel_transport.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +233 -0
- package/dist/telemetry/trace_types.cjs.map +1 -0
- package/dist/telemetry/trace_types.d.cts +74 -0
- package/dist/telemetry/trace_types.d.ts +74 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -0
- package/dist/telemetry/trace_types.js +141 -0
- package/dist/telemetry/trace_types.js.map +1 -0
- package/dist/telemetry/traces.cjs +484 -0
- package/dist/telemetry/traces.cjs.map +1 -0
- package/dist/telemetry/traces.d.cts +116 -0
- package/dist/telemetry/traces.d.ts +116 -0
- package/dist/telemetry/traces.d.ts.map +1 -0
- package/dist/telemetry/traces.js +449 -0
- package/dist/telemetry/traces.js.map +1 -0
- package/dist/telemetry/utils.cjs +86 -0
- package/dist/telemetry/utils.cjs.map +1 -0
- package/dist/telemetry/utils.d.cts +5 -0
- package/dist/telemetry/utils.d.ts +5 -0
- package/dist/telemetry/utils.d.ts.map +1 -0
- package/dist/telemetry/utils.js +51 -0
- package/dist/telemetry/utils.js.map +1 -0
- package/dist/tokenize/basic/sentence.cjs +3 -3
- package/dist/tokenize/basic/sentence.cjs.map +1 -1
- package/dist/tokenize/basic/sentence.js +3 -3
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +3 -1
- package/dist/tokenize/tokenizer.test.cjs.map +1 -1
- package/dist/tokenize/tokenizer.test.js +3 -1
- package/dist/tokenize/tokenizer.test.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +466 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +442 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +25 -8
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +25 -8
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +189 -57
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +58 -6
- package/dist/tts/tts.d.ts +58 -6
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +191 -59
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +24 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +45 -10
- package/dist/types.d.ts +45 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +20 -30
- package/dist/types.js.map +1 -1
- package/dist/utils.cjs +122 -26
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +41 -1
- package/dist/utils.d.ts +41 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +117 -25
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +73 -1
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +74 -10
- package/dist/utils.test.js.map +1 -1
- package/dist/vad.cjs +35 -15
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +15 -5
- package/dist/vad.d.ts +15 -5
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +35 -15
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +258 -35
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +54 -13
- package/dist/voice/agent.d.ts +54 -13
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +254 -34
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +314 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +316 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1116 -385
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +72 -11
- package/dist/voice/agent_activity.d.ts +72 -11
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +1119 -383
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_activity.test.cjs +135 -0
- package/dist/voice/agent_activity.test.cjs.map +1 -0
- package/dist/voice/agent_activity.test.js +134 -0
- package/dist/voice/agent_activity.test.js.map +1 -0
- package/dist/voice/agent_session.cjs +550 -90
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +185 -25
- package/dist/voice/agent_session.d.ts +185 -25
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +556 -91
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +605 -46
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +96 -4
- package/dist/voice/audio_recognition.d.ts +96 -4
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +611 -47
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +295 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +299 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/avatar/datastream_io.cjs +7 -1
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +7 -1
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs +367 -0
- package/dist/voice/background_audio.cjs.map +1 -0
- package/dist/voice/background_audio.d.cts +123 -0
- package/dist/voice/background_audio.d.ts +123 -0
- package/dist/voice/background_audio.d.ts.map +1 -0
- package/dist/voice/background_audio.js +343 -0
- package/dist/voice/background_audio.js.map +1 -0
- package/dist/voice/events.cjs +3 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +16 -9
- package/dist/voice/events.d.ts +16 -9
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +3 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +205 -41
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +21 -5
- package/dist/voice/generation.d.ts +21 -5
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +215 -43
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/generation_tools.test.cjs +236 -0
- package/dist/voice/generation_tools.test.cjs.map +1 -0
- package/dist/voice/generation_tools.test.js +235 -0
- package/dist/voice/generation_tools.test.js.map +1 -0
- package/dist/voice/index.cjs +33 -2
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +8 -2
- package/dist/voice/index.d.ts +8 -2
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +19 -2
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/io.cjs +66 -6
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +67 -7
- package/dist/voice/io.d.ts +67 -7
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +62 -5
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +607 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +573 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/remote_session.cjs +922 -0
- package/dist/voice/remote_session.cjs.map +1 -0
- package/dist/voice/remote_session.d.cts +108 -0
- package/dist/voice/remote_session.d.ts +108 -0
- package/dist/voice/remote_session.d.ts.map +1 -0
- package/dist/voice/remote_session.js +887 -0
- package/dist/voice/remote_session.js.map +1 -0
- package/dist/voice/report.cjs +88 -0
- package/dist/voice/report.cjs.map +1 -0
- package/dist/voice/report.d.cts +49 -0
- package/dist/voice/report.d.ts +49 -0
- package/dist/voice/report.d.ts.map +1 -0
- package/dist/voice/report.js +63 -0
- package/dist/voice/report.js.map +1 -0
- package/dist/voice/report.test.cjs +121 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +120 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/_input.cjs +40 -7
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +5 -2
- package/dist/voice/room_io/_input.d.ts +5 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +41 -8
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +19 -11
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +7 -4
- package/dist/voice/room_io/_output.d.ts +7 -4
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +20 -12
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +33 -6
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +29 -9
- package/dist/voice/room_io/room_io.d.ts +29 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +33 -7
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +22 -4
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +17 -2
- package/dist/voice/speech_handle.d.ts +17 -2
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +21 -4
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +57 -0
- package/dist/voice/testing/index.cjs.map +1 -0
- package/dist/voice/testing/index.d.cts +21 -0
- package/dist/voice/testing/index.d.ts +21 -0
- package/dist/voice/testing/index.d.ts.map +1 -0
- package/dist/voice/testing/index.js +35 -0
- package/dist/voice/testing/index.js.map +1 -0
- package/dist/voice/testing/run_result.cjs +817 -0
- package/dist/voice/testing/run_result.cjs.map +1 -0
- package/dist/voice/testing/run_result.d.cts +385 -0
- package/dist/voice/testing/run_result.d.ts +385 -0
- package/dist/voice/testing/run_result.d.ts.map +1 -0
- package/dist/voice/testing/run_result.js +790 -0
- package/dist/voice/testing/run_result.js.map +1 -0
- package/dist/voice/testing/types.cjs +46 -0
- package/dist/voice/testing/types.cjs.map +1 -0
- package/dist/voice/testing/types.d.cts +83 -0
- package/dist/voice/testing/types.d.ts +83 -0
- package/dist/voice/testing/types.d.ts.map +1 -0
- package/dist/voice/testing/types.js +19 -0
- package/dist/voice/testing/types.js.map +1 -0
- package/dist/voice/transcription/synchronizer.cjs +139 -15
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +143 -16
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +157 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +37 -0
- package/dist/voice/turn_config/utils.d.ts +37 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +131 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +128 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +127 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/dist/worker.cjs +44 -52
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +18 -8
- package/dist/worker.d.ts +18 -8
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +43 -43
- package/dist/worker.js.map +1 -1
- package/package.json +32 -12
- package/resources/NOTICE +2 -0
- package/resources/keyboard-typing.ogg +0 -0
- package/resources/keyboard-typing2.ogg +0 -0
- package/resources/office-ambience.ogg +0 -0
- package/src/audio.ts +132 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/cli.ts +57 -66
- package/src/connection_pool.test.ts +346 -0
- package/src/connection_pool.ts +307 -0
- package/src/constants.ts +14 -0
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/http_server.ts +18 -6
- package/src/index.ts +15 -13
- package/src/inference/api_protos.ts +85 -2
- package/src/inference/index.ts +32 -4
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +206 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +204 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +406 -0
- package/src/inference/llm.ts +214 -163
- package/src/inference/stt.test.ts +253 -0
- package/src/inference/stt.ts +449 -208
- package/src/inference/tts.test.ts +267 -0
- package/src/inference/tts.ts +377 -115
- package/src/inference/utils.ts +30 -2
- package/src/ipc/inference_proc_executor.ts +11 -3
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_executor.ts +11 -1
- package/src/ipc/job_proc_lazy_main.ts +86 -20
- package/src/ipc/supervised_proc.test.ts +153 -0
- package/src/ipc/supervised_proc.ts +39 -10
- package/src/job.ts +120 -1
- package/src/language.test.ts +62 -0
- package/src/language.ts +380 -0
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
- package/src/llm/chat_context.test.ts +655 -0
- package/src/llm/chat_context.ts +412 -2
- package/src/llm/fallback_adapter.test.ts +238 -0
- package/src/llm/fallback_adapter.ts +391 -0
- package/src/llm/index.ts +11 -0
- package/src/llm/llm.ts +77 -12
- package/src/llm/provider_format/google.test.ts +72 -1
- package/src/llm/provider_format/google.ts +10 -6
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +480 -2
- package/src/llm/provider_format/openai.ts +152 -21
- package/src/llm/provider_format/utils.ts +11 -5
- package/src/llm/realtime.ts +23 -2
- package/src/llm/remote_chat_context.ts +2 -2
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +115 -17
- package/src/llm/utils.ts +24 -16
- package/src/llm/zod-utils.test.ts +577 -0
- package/src/llm/zod-utils.ts +153 -0
- package/src/log.ts +71 -19
- package/src/metrics/base.ts +78 -19
- package/src/metrics/index.ts +12 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +14 -3
- package/src/metrics/utils.ts +27 -7
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +43 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +545 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stream/stream_channel.test.ts +37 -0
- package/src/stream/stream_channel.ts +43 -3
- package/src/stt/stream_adapter.ts +30 -9
- package/src/stt/stt.ts +131 -22
- package/src/telemetry/index.ts +28 -0
- package/src/telemetry/logging.ts +55 -0
- package/src/telemetry/otel_http_exporter.ts +218 -0
- package/src/telemetry/pino_otel_transport.ts +265 -0
- package/src/telemetry/trace_types.ts +109 -0
- package/src/telemetry/traces.ts +673 -0
- package/src/telemetry/utils.ts +61 -0
- package/src/tokenize/basic/sentence.ts +3 -3
- package/src/tokenize/tokenizer.test.ts +4 -0
- package/src/transcription.ts +6 -0
- package/src/tts/fallback_adapter.ts +579 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/stream_adapter.ts +38 -8
- package/src/tts/tts.ts +245 -62
- package/src/types.ts +62 -33
- package/src/utils.test.ts +90 -10
- package/src/utils.ts +176 -31
- package/src/vad.ts +42 -18
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +347 -2
- package/src/voice/agent.ts +346 -44
- package/src/voice/agent_activity.test.ts +194 -0
- package/src/voice/agent_activity.ts +1457 -388
- package/src/voice/agent_session.ts +817 -112
- package/src/voice/audio_recognition.ts +845 -70
- package/src/voice/audio_recognition_span.test.ts +341 -0
- package/src/voice/avatar/datastream_io.ts +9 -1
- package/src/voice/background_audio.ts +494 -0
- package/src/voice/events.ts +27 -7
- package/src/voice/generation.ts +310 -56
- package/src/voice/generation_tools.test.ts +268 -0
- package/src/voice/index.ts +17 -3
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/io.ts +115 -12
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +783 -0
- package/src/voice/remote_session.ts +1083 -0
- package/src/voice/report.test.ts +136 -0
- package/src/voice/report.ts +140 -0
- package/src/voice/room_io/_input.ts +45 -10
- package/src/voice/room_io/_output.ts +26 -14
- package/src/voice/room_io/room_io.ts +67 -22
- package/src/voice/speech_handle.ts +38 -6
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +52 -0
- package/src/voice/testing/run_result.ts +995 -0
- package/src/voice/testing/types.ts +118 -0
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +204 -19
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +148 -0
- package/src/voice/turn_config/utils.ts +167 -0
- package/src/voice/utils.ts +29 -0
- package/src/worker.ts +92 -78
- package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
|
@@ -18,18 +18,24 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
18
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
19
|
var agent_activity_exports = {};
|
|
20
20
|
__export(agent_activity_exports, {
|
|
21
|
-
AgentActivity: () => AgentActivity
|
|
21
|
+
AgentActivity: () => AgentActivity,
|
|
22
|
+
agentActivityStorage: () => agentActivityStorage,
|
|
23
|
+
onEnterStorage: () => onEnterStorage
|
|
22
24
|
});
|
|
23
25
|
module.exports = __toCommonJS(agent_activity_exports);
|
|
24
26
|
var import_mutex = require("@livekit/mutex");
|
|
27
|
+
var import_api = require("@opentelemetry/api");
|
|
25
28
|
var import_heap_js = require("heap-js");
|
|
26
29
|
var import_node_async_hooks = require("node:async_hooks");
|
|
27
30
|
var import_web = require("node:stream/web");
|
|
31
|
+
var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
|
|
28
32
|
var import_chat_context = require("../llm/chat_context.cjs");
|
|
29
33
|
var import_llm = require("../llm/index.cjs");
|
|
34
|
+
var import_tool_context = require("../llm/tool_context.cjs");
|
|
30
35
|
var import_log = require("../log.cjs");
|
|
31
|
-
var
|
|
36
|
+
var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
|
|
32
37
|
var import_stt = require("../stt/stt.cjs");
|
|
38
|
+
var import_telemetry = require("../telemetry/index.cjs");
|
|
33
39
|
var import_word = require("../tokenize/basic/word.cjs");
|
|
34
40
|
var import_tts = require("../tts/tts.cjs");
|
|
35
41
|
var import_utils = require("../utils.cjs");
|
|
@@ -40,28 +46,66 @@ var import_audio_recognition = require("./audio_recognition.cjs");
|
|
|
40
46
|
var import_events = require("./events.cjs");
|
|
41
47
|
var import_generation = require("./generation.cjs");
|
|
42
48
|
var import_speech_handle = require("./speech_handle.cjs");
|
|
43
|
-
|
|
49
|
+
var import_utils2 = require("./utils.cjs");
|
|
50
|
+
const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
51
|
+
const onEnterStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
44
52
|
class AgentActivity {
|
|
53
|
+
agent;
|
|
54
|
+
agentSession;
|
|
45
55
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
46
56
|
started = false;
|
|
47
57
|
audioRecognition;
|
|
48
58
|
realtimeSession;
|
|
59
|
+
realtimeSpans;
|
|
60
|
+
// Maps response_id to OTEL span for metrics recording
|
|
49
61
|
turnDetectionMode;
|
|
50
62
|
logger = (0, import_log.log)();
|
|
51
|
-
|
|
63
|
+
_schedulingPaused = true;
|
|
64
|
+
_drainBlockedTasks = [];
|
|
52
65
|
_currentSpeech;
|
|
53
66
|
speechQueue;
|
|
54
67
|
// [priority, timestamp, speechHandle]
|
|
55
68
|
q_updated;
|
|
56
69
|
speechTasks = /* @__PURE__ */ new Set();
|
|
57
70
|
lock = new import_mutex.Mutex();
|
|
58
|
-
audioStream = new
|
|
71
|
+
audioStream = new import_multi_input_stream.MultiInputStream();
|
|
72
|
+
audioStreamId;
|
|
59
73
|
// default to null as None, which maps to the default provider tool choice value
|
|
60
74
|
toolChoice = null;
|
|
61
|
-
|
|
62
|
-
|
|
75
|
+
_preemptiveGeneration;
|
|
76
|
+
interruptionDetector;
|
|
77
|
+
isInterruptionDetectionEnabled;
|
|
78
|
+
isInterruptionByAudioActivityEnabled;
|
|
79
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
80
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
81
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
82
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
83
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
84
|
+
onModelError = (ev) => this.onError(ev);
|
|
85
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
86
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.OverlappingSpeech, ev);
|
|
87
|
+
};
|
|
88
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
89
|
+
this.agentSession._usageCollector.collect(ev);
|
|
90
|
+
this.agentSession.emit(
|
|
91
|
+
import_events.AgentSessionEventTypes.MetricsCollected,
|
|
92
|
+
(0, import_events.createMetricsCollectedEvent)({ metrics: ev })
|
|
93
|
+
);
|
|
94
|
+
};
|
|
95
|
+
onInterruptionError = (ev) => {
|
|
96
|
+
const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
|
|
97
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
|
|
98
|
+
if (!ev.recoverable) {
|
|
99
|
+
this.agentSession._onError(ev);
|
|
100
|
+
this.fallbackToVadInterruption();
|
|
101
|
+
return;
|
|
102
|
+
}
|
|
103
|
+
this.agentSession._onError(ev);
|
|
104
|
+
};
|
|
63
105
|
/** @internal */
|
|
64
106
|
_mainTask;
|
|
107
|
+
_onEnterTask;
|
|
108
|
+
_onExitTask;
|
|
65
109
|
_userTurnCompletedTask;
|
|
66
110
|
constructor(agent, agentSession) {
|
|
67
111
|
this.agent = agent;
|
|
@@ -73,7 +117,7 @@ class AgentActivity {
|
|
|
73
117
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
74
118
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
75
119
|
this.logger.warn(
|
|
76
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
120
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
77
121
|
);
|
|
78
122
|
this.turnDetectionMode = void 0;
|
|
79
123
|
}
|
|
@@ -116,89 +160,136 @@ class AgentActivity {
|
|
|
116
160
|
);
|
|
117
161
|
this.turnDetectionMode = void 0;
|
|
118
162
|
}
|
|
119
|
-
if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
163
|
+
if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
120
164
|
this.logger.warn(
|
|
121
|
-
"VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
|
|
165
|
+
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
122
166
|
);
|
|
123
167
|
}
|
|
168
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
169
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
170
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
171
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
124
172
|
}
|
|
125
173
|
async start() {
|
|
126
174
|
const unlock = await this.lock.lock();
|
|
127
175
|
try {
|
|
128
|
-
this.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
176
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
177
|
+
} finally {
|
|
178
|
+
unlock();
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
async resume() {
|
|
182
|
+
const unlock = await this.lock.lock();
|
|
183
|
+
try {
|
|
184
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
185
|
+
} finally {
|
|
186
|
+
unlock();
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
async _startSession(options) {
|
|
190
|
+
var _a, _b, _c, _d, _e;
|
|
191
|
+
const { spanName, runOnEnter } = options;
|
|
192
|
+
const startSpan = import_telemetry.tracer.startSpan({
|
|
193
|
+
name: spanName,
|
|
194
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
195
|
+
context: import_api.ROOT_CONTEXT
|
|
196
|
+
});
|
|
197
|
+
this.agent._agentActivity = this;
|
|
198
|
+
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
199
|
+
this.realtimeSession = this.llm.session();
|
|
200
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
201
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
202
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
203
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
204
|
+
this.realtimeSession.on(
|
|
205
|
+
"input_audio_transcription_completed",
|
|
206
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
207
|
+
);
|
|
208
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
209
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
210
|
+
(0, import_generation.removeInstructions)(this.agent._chatCtx);
|
|
211
|
+
try {
|
|
212
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
213
|
+
} catch (error) {
|
|
214
|
+
this.logger.error(error, "failed to update the instructions");
|
|
215
|
+
}
|
|
216
|
+
try {
|
|
217
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
218
|
+
} catch (error) {
|
|
219
|
+
this.logger.error(error, "failed to update the chat context");
|
|
220
|
+
}
|
|
221
|
+
try {
|
|
222
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
223
|
+
} catch (error) {
|
|
224
|
+
this.logger.error(error, "failed to update the tools");
|
|
225
|
+
}
|
|
226
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
227
|
+
this.logger.error(
|
|
228
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
137
229
|
);
|
|
138
|
-
this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
|
|
139
|
-
this.realtimeSession.on("error", (ev) => this.onError(ev));
|
|
140
|
-
(0, import_generation.removeInstructions)(this.agent._chatCtx);
|
|
141
|
-
try {
|
|
142
|
-
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
143
|
-
} catch (error) {
|
|
144
|
-
this.logger.error(error, "failed to update the instructions");
|
|
145
|
-
}
|
|
146
|
-
try {
|
|
147
|
-
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
148
|
-
} catch (error) {
|
|
149
|
-
this.logger.error(error, "failed to update the chat context");
|
|
150
|
-
}
|
|
151
|
-
try {
|
|
152
|
-
await this.realtimeSession.updateTools(this.tools);
|
|
153
|
-
} catch (error) {
|
|
154
|
-
this.logger.error(error, "failed to update the tools");
|
|
155
|
-
}
|
|
156
|
-
} else if (this.llm instanceof import_llm.LLM) {
|
|
157
|
-
try {
|
|
158
|
-
(0, import_generation.updateInstructions)({
|
|
159
|
-
chatCtx: this.agent._chatCtx,
|
|
160
|
-
instructions: this.agent.instructions,
|
|
161
|
-
addIfMissing: true
|
|
162
|
-
});
|
|
163
|
-
} catch (error) {
|
|
164
|
-
this.logger.error("failed to update the instructions", error);
|
|
165
|
-
}
|
|
166
230
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
231
|
+
} else if (this.llm instanceof import_llm.LLM) {
|
|
232
|
+
try {
|
|
233
|
+
(0, import_generation.updateInstructions)({
|
|
234
|
+
chatCtx: this.agent._chatCtx,
|
|
235
|
+
instructions: this.agent.instructions,
|
|
236
|
+
addIfMissing: true
|
|
237
|
+
});
|
|
238
|
+
} catch (error) {
|
|
239
|
+
this.logger.error("failed to update the instructions", error);
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
243
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
244
|
+
this.llm.on("error", this.onModelError);
|
|
245
|
+
}
|
|
246
|
+
if (this.stt instanceof import_stt.STT) {
|
|
247
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
248
|
+
this.stt.on("error", this.onModelError);
|
|
249
|
+
}
|
|
250
|
+
if (this.tts instanceof import_tts.TTS) {
|
|
251
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
252
|
+
this.tts.on("error", this.onModelError);
|
|
253
|
+
}
|
|
254
|
+
if (this.vad instanceof import_vad.VAD) {
|
|
255
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
256
|
+
}
|
|
257
|
+
this.audioRecognition = new import_audio_recognition.AudioRecognition({
|
|
258
|
+
recognitionHooks: this,
|
|
259
|
+
// Disable stt node if stt is not provided
|
|
260
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
261
|
+
vad: this.vad,
|
|
262
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
263
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
264
|
+
interruptionDetection: this.interruptionDetector,
|
|
265
|
+
minEndpointingDelay: ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.endpointing) == null ? void 0 : _b.minDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
|
|
266
|
+
maxEndpointingDelay: ((_d = (_c = this.agent.turnHandling) == null ? void 0 : _c.endpointing) == null ? void 0 : _d.maxDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
|
|
267
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
268
|
+
sttModel: (_e = this.stt) == null ? void 0 : _e.label,
|
|
269
|
+
sttProvider: this.getSttProvider(),
|
|
270
|
+
getLinkedParticipant: () => {
|
|
271
|
+
var _a2;
|
|
272
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
273
|
+
}
|
|
274
|
+
});
|
|
275
|
+
this.audioRecognition.start();
|
|
276
|
+
this.started = true;
|
|
277
|
+
this._resumeSchedulingTask();
|
|
278
|
+
if (runOnEnter) {
|
|
279
|
+
this._onEnterTask = this.createSpeechTask({
|
|
280
|
+
taskFn: () => onEnterStorage.run(
|
|
281
|
+
{ session: this.agentSession, agent: this.agent },
|
|
282
|
+
() => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
283
|
+
name: "on_enter",
|
|
284
|
+
context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
|
|
285
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
286
|
+
})
|
|
287
|
+
),
|
|
288
|
+
inlineTask: true,
|
|
197
289
|
name: "AgentActivity_onEnter"
|
|
198
290
|
});
|
|
199
|
-
} finally {
|
|
200
|
-
unlock();
|
|
201
291
|
}
|
|
292
|
+
startSpan.end();
|
|
202
293
|
}
|
|
203
294
|
get currentSpeech() {
|
|
204
295
|
return this._currentSpeech;
|
|
@@ -209,6 +300,15 @@ class AgentActivity {
|
|
|
209
300
|
get stt() {
|
|
210
301
|
return this.agent.stt || this.agentSession.stt;
|
|
211
302
|
}
|
|
303
|
+
getSttProvider() {
|
|
304
|
+
var _a;
|
|
305
|
+
const label = (_a = this.stt) == null ? void 0 : _a.label;
|
|
306
|
+
if (!label) {
|
|
307
|
+
return void 0;
|
|
308
|
+
}
|
|
309
|
+
const [provider] = label.split("-", 1);
|
|
310
|
+
return provider || label;
|
|
311
|
+
}
|
|
212
312
|
get llm() {
|
|
213
313
|
return this.agent.llm || this.agentSession.llm;
|
|
214
314
|
}
|
|
@@ -218,21 +318,46 @@ class AgentActivity {
|
|
|
218
318
|
get tools() {
|
|
219
319
|
return this.agent.toolCtx;
|
|
220
320
|
}
|
|
221
|
-
get
|
|
222
|
-
return this.
|
|
321
|
+
get schedulingPaused() {
|
|
322
|
+
return this._schedulingPaused;
|
|
223
323
|
}
|
|
224
324
|
get realtimeLLMSession() {
|
|
225
325
|
return this.realtimeSession;
|
|
226
326
|
}
|
|
227
327
|
get allowInterruptions() {
|
|
228
|
-
|
|
328
|
+
var _a, _b;
|
|
329
|
+
return ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.enabled) ?? this.agentSession.sessionOptions.turnHandling.interruption.enabled;
|
|
330
|
+
}
|
|
331
|
+
get useTtsAlignedTranscript() {
|
|
332
|
+
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
229
333
|
}
|
|
230
334
|
get turnDetection() {
|
|
231
|
-
|
|
335
|
+
var _a;
|
|
336
|
+
return ((_a = this.agent.turnHandling) == null ? void 0 : _a.turnDetection) ?? this.agentSession.turnDetection;
|
|
232
337
|
}
|
|
338
|
+
get turnHandling() {
|
|
339
|
+
return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
|
|
340
|
+
}
|
|
341
|
+
// get minEndpointingDelay(): number {
|
|
342
|
+
// return (
|
|
343
|
+
// this.agent.turnHandling?.endpointing?.minDelay ??
|
|
344
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
|
|
345
|
+
// );
|
|
346
|
+
// }
|
|
347
|
+
// get maxEndpointingDelay(): number {
|
|
348
|
+
// return (
|
|
349
|
+
// this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
350
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
|
|
351
|
+
// );
|
|
352
|
+
// }
|
|
233
353
|
get toolCtx() {
|
|
234
354
|
return this.agent.toolCtx;
|
|
235
355
|
}
|
|
356
|
+
/** @internal */
|
|
357
|
+
get inputStartedAt() {
|
|
358
|
+
var _a;
|
|
359
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
360
|
+
}
|
|
236
361
|
async updateChatCtx(chatCtx) {
|
|
237
362
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
238
363
|
this.agent._chatCtx = chatCtx;
|
|
@@ -247,36 +372,79 @@ class AgentActivity {
|
|
|
247
372
|
});
|
|
248
373
|
}
|
|
249
374
|
}
|
|
250
|
-
|
|
375
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
376
|
+
async updateTools(tools) {
|
|
377
|
+
this.agent._tools = { ...tools };
|
|
378
|
+
if (this.realtimeSession) {
|
|
379
|
+
await this.realtimeSession.updateTools(tools);
|
|
380
|
+
}
|
|
381
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
382
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
updateOptions({
|
|
386
|
+
toolChoice,
|
|
387
|
+
turnDetection
|
|
388
|
+
}) {
|
|
251
389
|
if (toolChoice !== void 0) {
|
|
252
390
|
this.toolChoice = toolChoice;
|
|
253
391
|
}
|
|
254
392
|
if (this.realtimeSession) {
|
|
255
393
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
256
394
|
}
|
|
395
|
+
if (turnDetection !== void 0) {
|
|
396
|
+
this.turnDetectionMode = turnDetection;
|
|
397
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
398
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
399
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
if (this.audioRecognition) {
|
|
403
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
404
|
+
}
|
|
257
405
|
}
|
|
258
406
|
attachAudioInput(audioStream) {
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
407
|
+
void this.audioStream.close();
|
|
408
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
409
|
+
const aecWarmupAudioFilter = new import_web.TransformStream({
|
|
410
|
+
transform: (frame, controller) => {
|
|
411
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
412
|
+
if (!shouldDiscardForAecWarmup) {
|
|
413
|
+
controller.enqueue(frame);
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
});
|
|
417
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
418
|
+
if (this.realtimeSession && this.audioRecognition) {
|
|
419
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
266
420
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
267
|
-
}
|
|
268
|
-
if (this.audioRecognition) {
|
|
269
421
|
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
|
|
422
|
+
} else if (this.realtimeSession) {
|
|
423
|
+
this.realtimeSession.setInputAudioStream(
|
|
424
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
|
|
425
|
+
);
|
|
426
|
+
} else if (this.audioRecognition) {
|
|
427
|
+
this.audioRecognition.setInputAudioStream(
|
|
428
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
|
|
429
|
+
);
|
|
270
430
|
}
|
|
271
431
|
}
|
|
272
432
|
detachAudioInput() {
|
|
273
|
-
this.
|
|
433
|
+
if (this.audioStreamId === void 0) {
|
|
434
|
+
return;
|
|
435
|
+
}
|
|
436
|
+
void this.audioStream.close();
|
|
437
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
438
|
+
this.audioStreamId = void 0;
|
|
274
439
|
}
|
|
275
|
-
commitUserTurn() {
|
|
440
|
+
commitUserTurn(options = {}) {
|
|
441
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
276
442
|
if (!this.audioRecognition) {
|
|
277
|
-
|
|
443
|
+
if (throwIfNotReady) {
|
|
444
|
+
throw new Error("AudioRecognition is not initialized");
|
|
445
|
+
}
|
|
446
|
+
return;
|
|
278
447
|
}
|
|
279
|
-
const audioDetached = false;
|
|
280
448
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
281
449
|
}
|
|
282
450
|
clearUserTurn() {
|
|
@@ -312,22 +480,28 @@ class AgentActivity {
|
|
|
312
480
|
})
|
|
313
481
|
);
|
|
314
482
|
const task = this.createSpeechTask({
|
|
315
|
-
|
|
316
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
317
|
-
),
|
|
483
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
318
484
|
ownedSpeechHandle: handle,
|
|
319
485
|
name: "AgentActivity.say_tts"
|
|
320
486
|
});
|
|
321
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
487
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
322
488
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
323
489
|
return handle;
|
|
324
490
|
}
|
|
325
491
|
// -- Metrics and errors --
|
|
326
492
|
onMetricsCollected = (ev) => {
|
|
327
|
-
const speechHandle = speechHandleStorage.getStore();
|
|
493
|
+
const speechHandle = import_agent.speechHandleStorage.getStore();
|
|
328
494
|
if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
|
|
329
495
|
ev.speechId = speechHandle.id;
|
|
330
496
|
}
|
|
497
|
+
if (ev.type === "realtime_model_metrics" && this.realtimeSpans) {
|
|
498
|
+
const span = this.realtimeSpans.get(ev.requestId);
|
|
499
|
+
if (span) {
|
|
500
|
+
(0, import_telemetry.recordRealtimeMetrics)(span, ev);
|
|
501
|
+
this.realtimeSpans.delete(ev.requestId);
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
this.agentSession._usageCollector.collect(ev);
|
|
331
505
|
this.agentSession.emit(
|
|
332
506
|
import_events.AgentSessionEventTypes.MetricsCollected,
|
|
333
507
|
(0, import_events.createMetricsCollectedEvent)({ metrics: ev })
|
|
@@ -354,6 +528,13 @@ class AgentActivity {
|
|
|
354
528
|
this.logger.info("onInputSpeechStarted");
|
|
355
529
|
if (!this.vad) {
|
|
356
530
|
this.agentSession._updateUserState("speaking");
|
|
531
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
532
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
533
|
+
0,
|
|
534
|
+
Date.now(),
|
|
535
|
+
this.agentSession._userSpeakingSpan
|
|
536
|
+
);
|
|
537
|
+
}
|
|
357
538
|
}
|
|
358
539
|
try {
|
|
359
540
|
this.interrupt();
|
|
@@ -367,6 +548,9 @@ class AgentActivity {
|
|
|
367
548
|
onInputSpeechStopped(ev) {
|
|
368
549
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
369
550
|
if (!this.vad) {
|
|
551
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
552
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
553
|
+
}
|
|
370
554
|
this.agentSession._updateUserState("listening");
|
|
371
555
|
}
|
|
372
556
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -401,8 +585,8 @@ class AgentActivity {
|
|
|
401
585
|
if (ev.userInitiated) {
|
|
402
586
|
return;
|
|
403
587
|
}
|
|
404
|
-
if (this.
|
|
405
|
-
this.logger.warn("skipping new realtime generation, the
|
|
588
|
+
if (this.schedulingPaused) {
|
|
589
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
406
590
|
return;
|
|
407
591
|
}
|
|
408
592
|
const handle = import_speech_handle.SpeechHandle.create({
|
|
@@ -418,45 +602,91 @@ class AgentActivity {
|
|
|
418
602
|
);
|
|
419
603
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
420
604
|
this.createSpeechTask({
|
|
421
|
-
|
|
422
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
423
|
-
),
|
|
605
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
424
606
|
ownedSpeechHandle: handle,
|
|
425
607
|
name: "AgentActivity.realtimeGeneration"
|
|
426
608
|
});
|
|
427
609
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
428
610
|
}
|
|
429
611
|
// recognition hooks
|
|
430
|
-
onStartOfSpeech(
|
|
431
|
-
|
|
612
|
+
onStartOfSpeech(ev) {
|
|
613
|
+
let speechStartTime = Date.now();
|
|
614
|
+
if (ev) {
|
|
615
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
616
|
+
}
|
|
617
|
+
this.agentSession._updateUserState("speaking", {
|
|
618
|
+
lastSpeakingTime: speechStartTime,
|
|
619
|
+
otelContext: import_api.context.active()
|
|
620
|
+
});
|
|
621
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
622
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
623
|
+
ev.speechDuration,
|
|
624
|
+
speechStartTime,
|
|
625
|
+
this.agentSession._userSpeakingSpan
|
|
626
|
+
);
|
|
627
|
+
}
|
|
432
628
|
}
|
|
433
|
-
onEndOfSpeech(
|
|
434
|
-
|
|
629
|
+
onEndOfSpeech(ev) {
|
|
630
|
+
let speechEndTime = Date.now();
|
|
631
|
+
if (ev) {
|
|
632
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
633
|
+
}
|
|
634
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
635
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
636
|
+
speechEndTime,
|
|
637
|
+
this.agentSession._userSpeakingSpan
|
|
638
|
+
);
|
|
639
|
+
}
|
|
640
|
+
this.agentSession._updateUserState("listening", {
|
|
641
|
+
lastSpeakingTime: speechEndTime,
|
|
642
|
+
otelContext: import_api.context.active()
|
|
643
|
+
});
|
|
435
644
|
}
|
|
436
645
|
onVADInferenceDone(ev) {
|
|
437
|
-
var _a
|
|
646
|
+
var _a;
|
|
438
647
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
439
648
|
return;
|
|
440
649
|
}
|
|
441
|
-
if (
|
|
650
|
+
if (ev.speechDuration >= ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
651
|
+
this.interruptByAudioActivity();
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
interruptByAudioActivity() {
|
|
655
|
+
var _a, _b, _c, _d;
|
|
656
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
442
657
|
return;
|
|
443
658
|
}
|
|
444
|
-
if (
|
|
659
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
660
|
+
return;
|
|
661
|
+
}
|
|
662
|
+
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
445
663
|
return;
|
|
446
664
|
}
|
|
447
|
-
if (this.stt && this.agentSession.
|
|
665
|
+
if (this.stt && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
448
666
|
const text = this.audioRecognition.currentTranscript;
|
|
449
|
-
|
|
667
|
+
const normalizedText = text ?? "";
|
|
668
|
+
const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
|
|
669
|
+
if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
450
670
|
return;
|
|
451
671
|
}
|
|
452
672
|
}
|
|
453
|
-
(
|
|
673
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
454
674
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
455
|
-
this.logger.info(
|
|
456
|
-
|
|
675
|
+
this.logger.info(
|
|
676
|
+
{ "speech id": this._currentSpeech.id },
|
|
677
|
+
"speech interrupted by audio activity"
|
|
678
|
+
);
|
|
679
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
457
680
|
this._currentSpeech.interrupt();
|
|
458
681
|
}
|
|
459
682
|
}
|
|
683
|
+
onInterruption(ev) {
|
|
684
|
+
this.restoreInterruptionByAudioActivity();
|
|
685
|
+
this.interruptByAudioActivity();
|
|
686
|
+
if (this.audioRecognition) {
|
|
687
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
|
|
688
|
+
}
|
|
689
|
+
}
|
|
460
690
|
onInterimTranscript(ev) {
|
|
461
691
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
462
692
|
return;
|
|
@@ -465,10 +695,14 @@ class AgentActivity {
|
|
|
465
695
|
import_events.AgentSessionEventTypes.UserInputTranscribed,
|
|
466
696
|
(0, import_events.createUserInputTranscribedEvent)({
|
|
467
697
|
transcript: ev.alternatives[0].text,
|
|
468
|
-
isFinal: false
|
|
698
|
+
isFinal: false,
|
|
699
|
+
language: ev.alternatives[0].language
|
|
469
700
|
// TODO(AJS-106): add multi participant support
|
|
470
701
|
})
|
|
471
702
|
);
|
|
703
|
+
if (ev.alternatives[0].text) {
|
|
704
|
+
this.interruptByAudioActivity();
|
|
705
|
+
}
|
|
472
706
|
}
|
|
473
707
|
onFinalTranscript(ev) {
|
|
474
708
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
@@ -478,13 +712,70 @@ class AgentActivity {
|
|
|
478
712
|
import_events.AgentSessionEventTypes.UserInputTranscribed,
|
|
479
713
|
(0, import_events.createUserInputTranscribedEvent)({
|
|
480
714
|
transcript: ev.alternatives[0].text,
|
|
481
|
-
isFinal: true
|
|
715
|
+
isFinal: true,
|
|
716
|
+
language: ev.alternatives[0].language
|
|
482
717
|
// TODO(AJS-106): add multi participant support
|
|
483
718
|
})
|
|
484
719
|
);
|
|
720
|
+
if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
|
|
721
|
+
this.interruptByAudioActivity();
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
onPreemptiveGeneration(info) {
|
|
725
|
+
if (!this.agentSession.sessionOptions.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
726
|
+
return;
|
|
727
|
+
}
|
|
728
|
+
this.cancelPreemptiveGeneration();
|
|
729
|
+
this.logger.info(
|
|
730
|
+
{
|
|
731
|
+
newTranscript: info.newTranscript,
|
|
732
|
+
transcriptConfidence: info.transcriptConfidence
|
|
733
|
+
},
|
|
734
|
+
"starting preemptive generation"
|
|
735
|
+
);
|
|
736
|
+
const userMessage = import_chat_context.ChatMessage.create({
|
|
737
|
+
role: "user",
|
|
738
|
+
content: info.newTranscript,
|
|
739
|
+
transcriptConfidence: info.transcriptConfidence
|
|
740
|
+
});
|
|
741
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
742
|
+
const speechHandle = this.generateReply({
|
|
743
|
+
userMessage,
|
|
744
|
+
chatCtx,
|
|
745
|
+
scheduleSpeech: false
|
|
746
|
+
});
|
|
747
|
+
this._preemptiveGeneration = {
|
|
748
|
+
speechHandle,
|
|
749
|
+
userMessage,
|
|
750
|
+
info,
|
|
751
|
+
chatCtx: chatCtx.copy(),
|
|
752
|
+
tools: { ...this.tools },
|
|
753
|
+
toolChoice: this.toolChoice,
|
|
754
|
+
createdAt: Date.now()
|
|
755
|
+
};
|
|
756
|
+
}
|
|
757
|
+
cancelPreemptiveGeneration() {
|
|
758
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
759
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
760
|
+
this._preemptiveGeneration = void 0;
|
|
761
|
+
}
|
|
485
762
|
}
|
|
486
763
|
createSpeechTask(options) {
|
|
487
|
-
const {
|
|
764
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
765
|
+
const wrappedFn = (ctrl) => {
|
|
766
|
+
return agentActivityStorage.run(this, () => {
|
|
767
|
+
const currentTask = import_utils.Task.current();
|
|
768
|
+
if (currentTask) {
|
|
769
|
+
(0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
770
|
+
}
|
|
771
|
+
if (ownedSpeechHandle) {
|
|
772
|
+
return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
773
|
+
}
|
|
774
|
+
return taskFn(ctrl);
|
|
775
|
+
});
|
|
776
|
+
};
|
|
777
|
+
const task = import_utils.Task.from(wrappedFn, controller, name);
|
|
778
|
+
(0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
488
779
|
this.speechTasks.add(task);
|
|
489
780
|
task.addDoneCallback(() => {
|
|
490
781
|
this.speechTasks.delete(task);
|
|
@@ -500,20 +791,35 @@ class AgentActivity {
|
|
|
500
791
|
task.addDoneCallback(() => {
|
|
501
792
|
this.wakeupMainTask();
|
|
502
793
|
});
|
|
503
|
-
return task
|
|
794
|
+
return task;
|
|
504
795
|
}
|
|
505
796
|
async onEndOfTurn(info) {
|
|
506
|
-
|
|
507
|
-
|
|
797
|
+
var _a, _b;
|
|
798
|
+
if (this.schedulingPaused) {
|
|
799
|
+
this.cancelPreemptiveGeneration();
|
|
800
|
+
this.logger.warn(
|
|
801
|
+
{ user_input: info.newTranscript },
|
|
802
|
+
"skipping user input, speech scheduling is paused"
|
|
803
|
+
);
|
|
508
804
|
return true;
|
|
509
805
|
}
|
|
510
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.
|
|
511
|
-
|
|
512
|
-
|
|
806
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
807
|
+
const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
|
|
808
|
+
if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
809
|
+
this.cancelPreemptiveGeneration();
|
|
810
|
+
this.logger.info(
|
|
811
|
+
{
|
|
812
|
+
wordCount,
|
|
813
|
+
minInterruptionWords: this.agentSession.sessionOptions.turnHandling.interruption.minWords
|
|
814
|
+
},
|
|
815
|
+
"skipping user input, word count below minimum interruption threshold"
|
|
816
|
+
);
|
|
817
|
+
return false;
|
|
818
|
+
}
|
|
513
819
|
}
|
|
514
820
|
const oldTask = this._userTurnCompletedTask;
|
|
515
821
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
516
|
-
|
|
822
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
517
823
|
name: "AgentActivity.userTurnCompleted"
|
|
518
824
|
});
|
|
519
825
|
return true;
|
|
@@ -538,19 +844,49 @@ class AgentActivity {
|
|
|
538
844
|
throw new Error("Speech queue is empty");
|
|
539
845
|
}
|
|
540
846
|
const speechHandle = heapItem[2];
|
|
847
|
+
if (speechHandle.interrupted || speechHandle.done()) {
|
|
848
|
+
continue;
|
|
849
|
+
}
|
|
541
850
|
this._currentSpeech = speechHandle;
|
|
542
851
|
speechHandle._authorizeGeneration();
|
|
543
|
-
await speechHandle._waitForGeneration();
|
|
852
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
|
|
544
853
|
this._currentSpeech = void 0;
|
|
545
854
|
}
|
|
546
|
-
|
|
547
|
-
|
|
855
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
856
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
857
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
548
858
|
break;
|
|
549
859
|
}
|
|
550
860
|
this.q_updated = new import_utils.Future();
|
|
551
861
|
}
|
|
552
862
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
553
863
|
}
|
|
864
|
+
getDrainPendingSpeechTasks() {
|
|
865
|
+
const blockedHandles = [];
|
|
866
|
+
for (const task of this._drainBlockedTasks) {
|
|
867
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
868
|
+
if (!info) {
|
|
869
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
870
|
+
continue;
|
|
871
|
+
}
|
|
872
|
+
if (!info.speechHandle) {
|
|
873
|
+
continue;
|
|
874
|
+
}
|
|
875
|
+
blockedHandles.push(info.speechHandle);
|
|
876
|
+
}
|
|
877
|
+
const toWait = [];
|
|
878
|
+
for (const task of this.speechTasks) {
|
|
879
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
880
|
+
continue;
|
|
881
|
+
}
|
|
882
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
883
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
884
|
+
continue;
|
|
885
|
+
}
|
|
886
|
+
toWait.push(task);
|
|
887
|
+
}
|
|
888
|
+
return toWait;
|
|
889
|
+
}
|
|
554
890
|
wakeupMainTask() {
|
|
555
891
|
this.q_updated.resolve();
|
|
556
892
|
}
|
|
@@ -561,7 +897,8 @@ class AgentActivity {
|
|
|
561
897
|
chatCtx,
|
|
562
898
|
instructions: defaultInstructions,
|
|
563
899
|
toolChoice: defaultToolChoice,
|
|
564
|
-
allowInterruptions: defaultAllowInterruptions
|
|
900
|
+
allowInterruptions: defaultAllowInterruptions,
|
|
901
|
+
scheduleSpeech = true
|
|
565
902
|
} = options;
|
|
566
903
|
let instructions = defaultInstructions;
|
|
567
904
|
let toolChoice = defaultToolChoice;
|
|
@@ -575,7 +912,7 @@ class AgentActivity {
|
|
|
575
912
|
if (this.llm === void 0) {
|
|
576
913
|
throw new Error("trying to generate reply without an LLM model");
|
|
577
914
|
}
|
|
578
|
-
const functionCall = (_a = import_agent.
|
|
915
|
+
const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
579
916
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
580
917
|
toolChoice = "none";
|
|
581
918
|
}
|
|
@@ -593,19 +930,17 @@ class AgentActivity {
|
|
|
593
930
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
594
931
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
595
932
|
this.createSpeechTask({
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
})
|
|
608
|
-
),
|
|
933
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
934
|
+
speechHandle: handle,
|
|
935
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
936
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
937
|
+
instructions,
|
|
938
|
+
modelSettings: {
|
|
939
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
940
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
941
|
+
},
|
|
942
|
+
abortController
|
|
943
|
+
}),
|
|
609
944
|
ownedSpeechHandle: handle,
|
|
610
945
|
name: "AgentActivity.realtimeReply"
|
|
611
946
|
});
|
|
@@ -614,39 +949,56 @@ class AgentActivity {
|
|
|
614
949
|
instructions = `${this.agent.instructions}
|
|
615
950
|
${instructions}`;
|
|
616
951
|
}
|
|
952
|
+
const onEnterData = onEnterStorage.getStore();
|
|
953
|
+
const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
|
|
954
|
+
const tools = shouldFilterTools ? Object.fromEntries(
|
|
955
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
956
|
+
([, fnTool]) => !(fnTool.flags & import_llm.ToolFlag.IGNORE_ON_ENTER)
|
|
957
|
+
)
|
|
958
|
+
) : this.agent.toolCtx;
|
|
617
959
|
const task = this.createSpeechTask({
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
${instructions}` : instructions,
|
|
629
|
-
userMessage
|
|
630
|
-
)
|
|
960
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
961
|
+
handle,
|
|
962
|
+
chatCtx ?? this.agent.chatCtx,
|
|
963
|
+
tools,
|
|
964
|
+
{
|
|
965
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
966
|
+
},
|
|
967
|
+
abortController,
|
|
968
|
+
instructions,
|
|
969
|
+
userMessage
|
|
631
970
|
),
|
|
632
971
|
ownedSpeechHandle: handle,
|
|
633
972
|
name: "AgentActivity.pipelineReply"
|
|
634
973
|
});
|
|
635
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
974
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
975
|
+
}
|
|
976
|
+
if (scheduleSpeech) {
|
|
977
|
+
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
636
978
|
}
|
|
637
|
-
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
638
979
|
return handle;
|
|
639
980
|
}
|
|
640
|
-
interrupt() {
|
|
981
|
+
interrupt(options = {}) {
|
|
641
982
|
var _a;
|
|
983
|
+
const { force = false } = options;
|
|
984
|
+
this.cancelPreemptiveGeneration();
|
|
642
985
|
const future = new import_utils.Future();
|
|
643
986
|
const currentSpeech = this._currentSpeech;
|
|
644
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
987
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
645
988
|
for (const [_, __, speech] of this.speechQueue) {
|
|
646
|
-
speech.interrupt();
|
|
989
|
+
speech.interrupt(force);
|
|
647
990
|
}
|
|
648
991
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
649
|
-
if (
|
|
992
|
+
if (force) {
|
|
993
|
+
for (const task of this.speechTasks) {
|
|
994
|
+
task.cancel();
|
|
995
|
+
}
|
|
996
|
+
if (currentSpeech && !currentSpeech.done()) {
|
|
997
|
+
currentSpeech._markDone();
|
|
998
|
+
}
|
|
999
|
+
this.speechQueue.clear();
|
|
1000
|
+
future.resolve();
|
|
1001
|
+
} else if (currentSpeech === void 0) {
|
|
650
1002
|
future.resolve();
|
|
651
1003
|
} else {
|
|
652
1004
|
currentSpeech.addDoneCallback(() => {
|
|
@@ -664,7 +1016,7 @@ ${instructions}` : instructions,
|
|
|
664
1016
|
async userTurnCompleted(info, oldTask) {
|
|
665
1017
|
var _a, _b;
|
|
666
1018
|
if (oldTask) {
|
|
667
|
-
await oldTask;
|
|
1019
|
+
await oldTask.result;
|
|
668
1020
|
}
|
|
669
1021
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
670
1022
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -689,7 +1041,8 @@ ${instructions}` : instructions,
|
|
|
689
1041
|
}
|
|
690
1042
|
let userMessage = import_chat_context.ChatMessage.create({
|
|
691
1043
|
role: "user",
|
|
692
|
-
content: info.newTranscript
|
|
1044
|
+
content: info.newTranscript,
|
|
1045
|
+
transcriptConfidence: info.transcriptConfidence
|
|
693
1046
|
});
|
|
694
1047
|
const chatCtx = this.agent.chatCtx.copy();
|
|
695
1048
|
const startTime = Date.now();
|
|
@@ -707,13 +1060,57 @@ ${instructions}` : instructions,
|
|
|
707
1060
|
} else if (this.llm === void 0) {
|
|
708
1061
|
return;
|
|
709
1062
|
}
|
|
710
|
-
const
|
|
1063
|
+
const userMetricsReport = {};
|
|
1064
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1065
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1066
|
+
}
|
|
1067
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1068
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1069
|
+
}
|
|
1070
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1071
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1072
|
+
}
|
|
1073
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1074
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1075
|
+
}
|
|
1076
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1077
|
+
if (userMessage) {
|
|
1078
|
+
userMessage.metrics = userMetricsReport;
|
|
1079
|
+
}
|
|
1080
|
+
let speechHandle;
|
|
1081
|
+
if (this._preemptiveGeneration !== void 0) {
|
|
1082
|
+
const preemptive = this._preemptiveGeneration;
|
|
1083
|
+
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
|
|
1084
|
+
speechHandle = preemptive.speechHandle;
|
|
1085
|
+
if (preemptive.userMessage && userMessage) {
|
|
1086
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1087
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1088
|
+
}
|
|
1089
|
+
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1090
|
+
this.logger.debug(
|
|
1091
|
+
{
|
|
1092
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt
|
|
1093
|
+
},
|
|
1094
|
+
"using preemptive generation"
|
|
1095
|
+
);
|
|
1096
|
+
} else {
|
|
1097
|
+
this.logger.warn(
|
|
1098
|
+
"preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
|
|
1099
|
+
);
|
|
1100
|
+
preemptive.speechHandle._cancel();
|
|
1101
|
+
}
|
|
1102
|
+
this._preemptiveGeneration = void 0;
|
|
1103
|
+
}
|
|
1104
|
+
if (speechHandle === void 0) {
|
|
1105
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
1106
|
+
}
|
|
711
1107
|
const eouMetrics = {
|
|
712
1108
|
type: "eou_metrics",
|
|
713
1109
|
timestamp: Date.now(),
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
1110
|
+
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
1111
|
+
transcriptionDelayMs: info.transcriptionDelay,
|
|
1112
|
+
onUserTurnCompletedDelayMs: callbackDuration,
|
|
1113
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
717
1114
|
speechId: speechHandle.id
|
|
718
1115
|
};
|
|
719
1116
|
this.agentSession.emit(
|
|
@@ -722,7 +1119,9 @@ ${instructions}` : instructions,
|
|
|
722
1119
|
);
|
|
723
1120
|
}
|
|
724
1121
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
725
|
-
|
|
1122
|
+
var _a, _b;
|
|
1123
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
1124
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
726
1125
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
727
1126
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
728
1127
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
@@ -753,25 +1152,38 @@ ${instructions}` : instructions,
|
|
|
753
1152
|
textOut = _textOut;
|
|
754
1153
|
tasks.push(textForwardTask);
|
|
755
1154
|
}
|
|
756
|
-
|
|
757
|
-
|
|
1155
|
+
let replyStartedSpeakingAt;
|
|
1156
|
+
let replyTtsGenData = null;
|
|
1157
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1158
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1159
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1160
|
+
startTime: startedSpeakingAt,
|
|
1161
|
+
otelContext: speechHandle._agentTurnContext
|
|
1162
|
+
});
|
|
1163
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1164
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1165
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1166
|
+
}
|
|
758
1167
|
};
|
|
759
1168
|
if (!audioOutput) {
|
|
760
1169
|
if (textOut) {
|
|
761
|
-
textOut.firstTextFut.await.
|
|
1170
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
762
1171
|
}
|
|
763
1172
|
} else {
|
|
764
1173
|
let audioOut = null;
|
|
765
1174
|
if (!audio) {
|
|
766
|
-
const [ttsTask,
|
|
1175
|
+
const [ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
|
|
767
1176
|
(...args) => this.agent.ttsNode(...args),
|
|
768
1177
|
audioSource,
|
|
769
1178
|
modelSettings,
|
|
770
|
-
replyAbortController
|
|
1179
|
+
replyAbortController,
|
|
1180
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1181
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
771
1182
|
);
|
|
772
1183
|
tasks.push(ttsTask);
|
|
1184
|
+
replyTtsGenData = ttsGenData;
|
|
773
1185
|
const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
|
|
774
|
-
|
|
1186
|
+
ttsGenData.audioStream,
|
|
775
1187
|
audioOutput,
|
|
776
1188
|
replyAbortController
|
|
777
1189
|
);
|
|
@@ -786,7 +1198,7 @@ ${instructions}` : instructions,
|
|
|
786
1198
|
tasks.push(forwardTask);
|
|
787
1199
|
audioOut = _audioOut;
|
|
788
1200
|
}
|
|
789
|
-
audioOut.firstFrameFut.await.
|
|
1201
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
790
1202
|
}
|
|
791
1203
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
792
1204
|
if (audioOutput) {
|
|
@@ -801,28 +1213,63 @@ ${instructions}` : instructions,
|
|
|
801
1213
|
}
|
|
802
1214
|
}
|
|
803
1215
|
if (addToChatCtx) {
|
|
1216
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1217
|
+
const replyAssistantMetrics = {};
|
|
1218
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1219
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1220
|
+
}
|
|
1221
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1222
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1223
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1224
|
+
}
|
|
804
1225
|
const message = import_chat_context.ChatMessage.create({
|
|
805
1226
|
role: "assistant",
|
|
806
1227
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
807
|
-
interrupted: speechHandle.interrupted
|
|
1228
|
+
interrupted: speechHandle.interrupted,
|
|
1229
|
+
metrics: replyAssistantMetrics
|
|
808
1230
|
});
|
|
809
1231
|
this.agent._chatCtx.insert(message);
|
|
810
1232
|
this.agentSession._conversationItemAdded(message);
|
|
811
1233
|
}
|
|
812
1234
|
if (this.agentSession.agentState === "speaking") {
|
|
813
1235
|
this.agentSession._updateAgentState("listening");
|
|
1236
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1237
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1238
|
+
}
|
|
1239
|
+
this.restoreInterruptionByAudioActivity();
|
|
814
1240
|
}
|
|
815
1241
|
}
|
|
816
|
-
async
|
|
817
|
-
|
|
818
|
-
|
|
1242
|
+
_pipelineReplyTaskImpl = async ({
|
|
1243
|
+
speechHandle,
|
|
1244
|
+
chatCtx,
|
|
1245
|
+
toolCtx,
|
|
1246
|
+
modelSettings,
|
|
1247
|
+
replyAbortController,
|
|
1248
|
+
instructions,
|
|
1249
|
+
newMessage,
|
|
1250
|
+
toolsMessages,
|
|
1251
|
+
span,
|
|
1252
|
+
_previousUserMetrics
|
|
1253
|
+
}) => {
|
|
1254
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1255
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
1256
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1257
|
+
if (instructions) {
|
|
1258
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
1259
|
+
}
|
|
1260
|
+
if (newMessage) {
|
|
1261
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
|
|
1262
|
+
}
|
|
1263
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1264
|
+
if (localParticipant) {
|
|
1265
|
+
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
1266
|
+
}
|
|
1267
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
819
1268
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
820
1269
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
821
1270
|
chatCtx = chatCtx.copy();
|
|
822
1271
|
if (newMessage) {
|
|
823
1272
|
chatCtx.insert(newMessage);
|
|
824
|
-
this.agent._chatCtx.insert(newMessage);
|
|
825
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
826
1273
|
}
|
|
827
1274
|
if (instructions) {
|
|
828
1275
|
try {
|
|
@@ -835,7 +1282,6 @@ ${instructions}` : instructions,
|
|
|
835
1282
|
this.logger.error({ error: e }, "error occurred during updateInstructions");
|
|
836
1283
|
}
|
|
837
1284
|
}
|
|
838
|
-
this.agentSession._updateAgentState("thinking");
|
|
839
1285
|
const tasks = [];
|
|
840
1286
|
const [llmTask, llmGenData] = (0, import_generation.performLLMInference)(
|
|
841
1287
|
// preserve `this` context in llmNode
|
|
@@ -843,22 +1289,36 @@ ${instructions}` : instructions,
|
|
|
843
1289
|
chatCtx,
|
|
844
1290
|
toolCtx,
|
|
845
1291
|
modelSettings,
|
|
846
|
-
replyAbortController
|
|
1292
|
+
replyAbortController,
|
|
1293
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1294
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
847
1295
|
);
|
|
848
1296
|
tasks.push(llmTask);
|
|
849
|
-
const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
|
|
850
1297
|
let ttsTask = null;
|
|
851
|
-
let
|
|
1298
|
+
let ttsGenData = null;
|
|
1299
|
+
let llmOutput;
|
|
852
1300
|
if (audioOutput) {
|
|
853
|
-
[
|
|
1301
|
+
const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
|
|
1302
|
+
llmOutput = textOutput;
|
|
1303
|
+
[ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
|
|
854
1304
|
(...args) => this.agent.ttsNode(...args),
|
|
855
1305
|
ttsTextInput,
|
|
856
1306
|
modelSettings,
|
|
857
|
-
replyAbortController
|
|
1307
|
+
replyAbortController,
|
|
1308
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1309
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
858
1310
|
);
|
|
859
1311
|
tasks.push(ttsTask);
|
|
1312
|
+
} else {
|
|
1313
|
+
llmOutput = llmGenData.textStream;
|
|
860
1314
|
}
|
|
861
1315
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1316
|
+
let userMetrics = _previousUserMetrics;
|
|
1317
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1318
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1319
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1320
|
+
userMetrics = newMessage.metrics;
|
|
1321
|
+
}
|
|
862
1322
|
if (speechHandle.interrupted) {
|
|
863
1323
|
replyAbortController.abort();
|
|
864
1324
|
await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -868,7 +1328,20 @@ ${instructions}` : instructions,
|
|
|
868
1328
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
869
1329
|
speechHandle._clearAuthorization();
|
|
870
1330
|
const replyStartedAt = Date.now();
|
|
871
|
-
|
|
1331
|
+
let transcriptionInput = llmOutput;
|
|
1332
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1333
|
+
const timedTextsStream = await Promise.race([
|
|
1334
|
+
ttsGenData.timedTextsFut.await,
|
|
1335
|
+
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
1336
|
+
() => this.logger.warn("TTS task failed before resolving timedTextsFut")
|
|
1337
|
+
)) ?? Promise.resolve()
|
|
1338
|
+
]);
|
|
1339
|
+
if (timedTextsStream) {
|
|
1340
|
+
this.logger.debug("Using TTS aligned transcripts for transcription node input");
|
|
1341
|
+
transcriptionInput = timedTextsStream;
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
|
|
872
1345
|
let textOut = null;
|
|
873
1346
|
if (trNodeResult) {
|
|
874
1347
|
const [textForwardTask, _textOut] = (0, import_generation.performTextForwarding)(
|
|
@@ -879,29 +1352,44 @@ ${instructions}` : instructions,
|
|
|
879
1352
|
tasks.push(textForwardTask);
|
|
880
1353
|
textOut = _textOut;
|
|
881
1354
|
}
|
|
882
|
-
|
|
883
|
-
|
|
1355
|
+
let agentStartedSpeakingAt;
|
|
1356
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1357
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1358
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1359
|
+
startTime: startedSpeakingAt,
|
|
1360
|
+
otelContext: speechHandle._agentTurnContext
|
|
1361
|
+
});
|
|
1362
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1363
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1364
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1365
|
+
}
|
|
884
1366
|
};
|
|
885
1367
|
let audioOut = null;
|
|
886
1368
|
if (audioOutput) {
|
|
887
|
-
if (
|
|
1369
|
+
if (ttsGenData) {
|
|
888
1370
|
const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
|
|
889
|
-
|
|
1371
|
+
ttsGenData.audioStream,
|
|
890
1372
|
audioOutput,
|
|
891
1373
|
replyAbortController
|
|
892
1374
|
);
|
|
893
1375
|
audioOut = _audioOut;
|
|
894
1376
|
tasks.push(forwardTask);
|
|
895
|
-
audioOut.firstFrameFut.await.
|
|
1377
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
896
1378
|
} else {
|
|
897
|
-
throw Error("
|
|
1379
|
+
throw Error("ttsGenData is null when audioOutput is enabled");
|
|
898
1380
|
}
|
|
899
1381
|
} else {
|
|
900
|
-
textOut == null ? void 0 : textOut.firstTextFut.await.
|
|
1382
|
+
textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
901
1383
|
}
|
|
902
|
-
const onToolExecutionStarted = (
|
|
1384
|
+
const onToolExecutionStarted = (f) => {
|
|
1385
|
+
speechHandle._itemAdded([f]);
|
|
1386
|
+
this.agent._chatCtx.items.push(f);
|
|
1387
|
+
this.agentSession._toolItemsAdded([f]);
|
|
903
1388
|
};
|
|
904
|
-
const onToolExecutionCompleted = (
|
|
1389
|
+
const onToolExecutionCompleted = (out) => {
|
|
1390
|
+
if (out.toolCallOutput) {
|
|
1391
|
+
speechHandle._itemAdded([out.toolCallOutput]);
|
|
1392
|
+
}
|
|
905
1393
|
};
|
|
906
1394
|
const [executeToolsTask, toolOutput] = (0, import_generation.performToolExecutions)({
|
|
907
1395
|
session: this.agentSession,
|
|
@@ -917,28 +1405,53 @@ ${instructions}` : instructions,
|
|
|
917
1405
|
if (audioOutput) {
|
|
918
1406
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
919
1407
|
}
|
|
1408
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1409
|
+
const assistantMetrics = {};
|
|
1410
|
+
if (llmGenData.ttft !== void 0) {
|
|
1411
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1412
|
+
}
|
|
1413
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1414
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1415
|
+
}
|
|
1416
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1417
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1418
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1419
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1420
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1421
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1422
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1423
|
+
}
|
|
1424
|
+
}
|
|
1425
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1426
|
+
let hasSpeechMessage = false;
|
|
920
1427
|
if (toolsMessages) {
|
|
921
1428
|
for (const msg of toolsMessages) {
|
|
922
1429
|
msg.createdAt = replyStartedAt;
|
|
923
1430
|
}
|
|
924
|
-
|
|
1431
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1432
|
+
(m) => m.type === "function_call_output"
|
|
1433
|
+
);
|
|
1434
|
+
if (toolCallOutputs.length > 0) {
|
|
1435
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1436
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1437
|
+
}
|
|
925
1438
|
}
|
|
926
1439
|
if (speechHandle.interrupted) {
|
|
927
1440
|
this.logger.debug(
|
|
928
1441
|
{ speech_id: speechHandle.id },
|
|
929
1442
|
"Aborting all pipeline reply tasks due to interruption"
|
|
930
1443
|
);
|
|
1444
|
+
if (audioOutput) {
|
|
1445
|
+
audioOutput.clearBuffer();
|
|
1446
|
+
}
|
|
931
1447
|
replyAbortController.abort();
|
|
932
|
-
await
|
|
933
|
-
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT))
|
|
934
|
-
);
|
|
1448
|
+
await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
935
1449
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
936
1450
|
if (audioOutput) {
|
|
937
|
-
audioOutput.clearBuffer();
|
|
938
1451
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
939
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1452
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
940
1453
|
this.logger.info(
|
|
941
|
-
{ speech_id: speechHandle.id,
|
|
1454
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
942
1455
|
"playout interrupted"
|
|
943
1456
|
);
|
|
944
1457
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -949,19 +1462,27 @@ ${instructions}` : instructions,
|
|
|
949
1462
|
}
|
|
950
1463
|
}
|
|
951
1464
|
if (forwardedText) {
|
|
1465
|
+
hasSpeechMessage = true;
|
|
952
1466
|
const message = import_chat_context.ChatMessage.create({
|
|
953
1467
|
role: "assistant",
|
|
954
1468
|
content: forwardedText,
|
|
955
1469
|
id: llmGenData.id,
|
|
956
1470
|
interrupted: true,
|
|
957
|
-
createdAt: replyStartedAt
|
|
1471
|
+
createdAt: replyStartedAt,
|
|
1472
|
+
metrics: assistantMetrics
|
|
958
1473
|
});
|
|
959
1474
|
chatCtx.insert(message);
|
|
960
1475
|
this.agent._chatCtx.insert(message);
|
|
1476
|
+
speechHandle._itemAdded([message]);
|
|
961
1477
|
this.agentSession._conversationItemAdded(message);
|
|
1478
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
962
1479
|
}
|
|
963
1480
|
if (this.agentSession.agentState === "speaking") {
|
|
964
1481
|
this.agentSession._updateAgentState("listening");
|
|
1482
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1483
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1484
|
+
this.restoreInterruptionByAudioActivity();
|
|
1485
|
+
}
|
|
965
1486
|
}
|
|
966
1487
|
this.logger.info(
|
|
967
1488
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -972,16 +1493,20 @@ ${instructions}` : instructions,
|
|
|
972
1493
|
return;
|
|
973
1494
|
}
|
|
974
1495
|
if (textOut && textOut.text) {
|
|
1496
|
+
hasSpeechMessage = true;
|
|
975
1497
|
const message = import_chat_context.ChatMessage.create({
|
|
976
1498
|
role: "assistant",
|
|
977
1499
|
id: llmGenData.id,
|
|
978
1500
|
interrupted: false,
|
|
979
1501
|
createdAt: replyStartedAt,
|
|
980
|
-
content: textOut.text
|
|
1502
|
+
content: textOut.text,
|
|
1503
|
+
metrics: assistantMetrics
|
|
981
1504
|
});
|
|
982
1505
|
chatCtx.insert(message);
|
|
983
1506
|
this.agent._chatCtx.insert(message);
|
|
1507
|
+
speechHandle._itemAdded([message]);
|
|
984
1508
|
this.agentSession._conversationItemAdded(message);
|
|
1509
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
985
1510
|
this.logger.info(
|
|
986
1511
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
987
1512
|
"playout completed without interruption"
|
|
@@ -991,11 +1516,17 @@ ${instructions}` : instructions,
|
|
|
991
1516
|
this.agentSession._updateAgentState("thinking");
|
|
992
1517
|
} else if (this.agentSession.agentState === "speaking") {
|
|
993
1518
|
this.agentSession._updateAgentState("listening");
|
|
1519
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1520
|
+
{
|
|
1521
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1522
|
+
this.restoreInterruptionByAudioActivity();
|
|
1523
|
+
}
|
|
1524
|
+
}
|
|
994
1525
|
}
|
|
995
1526
|
speechHandle._markGenerationDone();
|
|
996
1527
|
await executeToolsTask.result;
|
|
997
1528
|
if (toolOutput.output.length === 0) return;
|
|
998
|
-
const { maxToolSteps } = this.agentSession.
|
|
1529
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
999
1530
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1000
1531
|
this.logger.warn(
|
|
1001
1532
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1003,45 +1534,15 @@ ${instructions}` : instructions,
|
|
|
1003
1534
|
);
|
|
1004
1535
|
return;
|
|
1005
1536
|
}
|
|
1006
|
-
const functionToolsExecutedEvent = (
|
|
1007
|
-
functionCalls: [],
|
|
1008
|
-
functionCallOutputs: []
|
|
1009
|
-
});
|
|
1010
|
-
let shouldGenerateToolReply = false;
|
|
1011
|
-
let newAgentTask = null;
|
|
1012
|
-
let ignoreTaskSwitch = false;
|
|
1013
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1014
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1015
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1016
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1017
|
-
if (sanitizedOut.replyRequired) {
|
|
1018
|
-
shouldGenerateToolReply = true;
|
|
1019
|
-
}
|
|
1020
|
-
}
|
|
1021
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1022
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1023
|
-
ignoreTaskSwitch = true;
|
|
1024
|
-
}
|
|
1025
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1026
|
-
this.logger.debug(
|
|
1027
|
-
{
|
|
1028
|
-
speechId: speechHandle.id,
|
|
1029
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1030
|
-
args: sanitizedOut.toolCall.args,
|
|
1031
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1032
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1033
|
-
},
|
|
1034
|
-
"Tool call execution finished"
|
|
1035
|
-
);
|
|
1036
|
-
}
|
|
1537
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1037
1538
|
this.agentSession.emit(
|
|
1038
1539
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1039
1540
|
functionToolsExecutedEvent
|
|
1040
1541
|
);
|
|
1041
|
-
let
|
|
1542
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1042
1543
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1043
1544
|
this.agentSession.updateAgent(newAgentTask);
|
|
1044
|
-
|
|
1545
|
+
schedulingPaused = true;
|
|
1045
1546
|
}
|
|
1046
1547
|
const toolMessages = [
|
|
1047
1548
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1049,54 +1550,96 @@ ${instructions}` : instructions,
|
|
|
1049
1550
|
];
|
|
1050
1551
|
if (shouldGenerateToolReply) {
|
|
1051
1552
|
chatCtx.insert(toolMessages);
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
stepIndex: speechHandle._stepIndex + 1,
|
|
1055
|
-
parent: speechHandle
|
|
1056
|
-
});
|
|
1057
|
-
this.agentSession.emit(
|
|
1058
|
-
import_events.AgentSessionEventTypes.SpeechCreated,
|
|
1059
|
-
(0, import_events.createSpeechCreatedEvent)({
|
|
1060
|
-
userInitiated: false,
|
|
1061
|
-
source: "tool_response",
|
|
1062
|
-
speechHandle: handle
|
|
1063
|
-
})
|
|
1064
|
-
);
|
|
1065
|
-
const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1553
|
+
speechHandle._numSteps += 1;
|
|
1554
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1066
1555
|
const toolResponseTask = this.createSpeechTask({
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
)
|
|
1556
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1557
|
+
speechHandle,
|
|
1558
|
+
chatCtx,
|
|
1559
|
+
toolCtx,
|
|
1560
|
+
{ toolChoice: respondToolChoice },
|
|
1561
|
+
replyAbortController,
|
|
1562
|
+
instructions,
|
|
1563
|
+
void 0,
|
|
1564
|
+
toolMessages,
|
|
1565
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1078
1566
|
),
|
|
1079
|
-
ownedSpeechHandle:
|
|
1567
|
+
ownedSpeechHandle: speechHandle,
|
|
1080
1568
|
name: "AgentActivity.pipelineReply"
|
|
1081
1569
|
});
|
|
1082
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1083
|
-
this.scheduleSpeech(
|
|
1570
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1571
|
+
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1084
1572
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1085
1573
|
for (const msg of toolMessages) {
|
|
1086
1574
|
msg.createdAt = replyStartedAt;
|
|
1087
1575
|
}
|
|
1088
|
-
|
|
1576
|
+
const toolCallOutputs = toolMessages.filter(
|
|
1577
|
+
(m) => m.type === "function_call_output"
|
|
1578
|
+
);
|
|
1579
|
+
if (toolCallOutputs.length > 0) {
|
|
1580
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1581
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1582
|
+
}
|
|
1089
1583
|
}
|
|
1090
|
-
}
|
|
1584
|
+
};
|
|
1585
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
|
|
1586
|
+
async (span) => this._pipelineReplyTaskImpl({
|
|
1587
|
+
speechHandle,
|
|
1588
|
+
chatCtx,
|
|
1589
|
+
toolCtx,
|
|
1590
|
+
modelSettings,
|
|
1591
|
+
replyAbortController,
|
|
1592
|
+
instructions,
|
|
1593
|
+
newMessage,
|
|
1594
|
+
toolsMessages,
|
|
1595
|
+
span,
|
|
1596
|
+
_previousUserMetrics
|
|
1597
|
+
}),
|
|
1598
|
+
{
|
|
1599
|
+
name: "agent_turn",
|
|
1600
|
+
context: this.agentSession.rootSpanContext
|
|
1601
|
+
}
|
|
1602
|
+
);
|
|
1091
1603
|
async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
|
|
1092
|
-
|
|
1093
|
-
|
|
1604
|
+
return import_telemetry.tracer.startActiveSpan(
|
|
1605
|
+
async (span) => this._realtimeGenerationTaskImpl({
|
|
1606
|
+
speechHandle,
|
|
1607
|
+
ev,
|
|
1608
|
+
modelSettings,
|
|
1609
|
+
replyAbortController,
|
|
1610
|
+
span
|
|
1611
|
+
}),
|
|
1612
|
+
{
|
|
1613
|
+
name: "agent_turn",
|
|
1614
|
+
context: this.agentSession.rootSpanContext
|
|
1615
|
+
}
|
|
1616
|
+
);
|
|
1617
|
+
}
|
|
1618
|
+
async _realtimeGenerationTaskImpl({
|
|
1619
|
+
speechHandle,
|
|
1620
|
+
ev,
|
|
1621
|
+
modelSettings,
|
|
1622
|
+
replyAbortController,
|
|
1623
|
+
span
|
|
1624
|
+
}) {
|
|
1625
|
+
var _a;
|
|
1626
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
1627
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1628
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1629
|
+
if (localParticipant) {
|
|
1630
|
+
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
1631
|
+
}
|
|
1632
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1094
1633
|
if (!this.realtimeSession) {
|
|
1095
1634
|
throw new Error("realtime session is not initialized");
|
|
1096
1635
|
}
|
|
1097
1636
|
if (!(this.llm instanceof import_llm.RealtimeModel)) {
|
|
1098
1637
|
throw new Error("llm is not a realtime model");
|
|
1099
1638
|
}
|
|
1639
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
|
|
1640
|
+
if (this.realtimeSpans && ev.responseId) {
|
|
1641
|
+
this.realtimeSpans.set(ev.responseId, span);
|
|
1642
|
+
}
|
|
1100
1643
|
this.logger.debug(
|
|
1101
1644
|
{ speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
|
|
1102
1645
|
"realtime generation started"
|
|
@@ -1109,10 +1652,17 @@ ${instructions}` : instructions,
|
|
|
1109
1652
|
if (speechHandle.interrupted) {
|
|
1110
1653
|
return;
|
|
1111
1654
|
}
|
|
1112
|
-
const onFirstFrame = () => {
|
|
1113
|
-
this.agentSession._updateAgentState("speaking"
|
|
1655
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1656
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1657
|
+
startTime: startedSpeakingAt,
|
|
1658
|
+
otelContext: speechHandle._agentTurnContext
|
|
1659
|
+
});
|
|
1114
1660
|
};
|
|
1115
1661
|
const readMessages = async (abortController, outputs) => {
|
|
1662
|
+
var _a2, _b;
|
|
1663
|
+
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1664
|
+
once: true
|
|
1665
|
+
});
|
|
1116
1666
|
const forwardTasks = [];
|
|
1117
1667
|
try {
|
|
1118
1668
|
for await (const msg of ev.messageStream) {
|
|
@@ -1122,7 +1672,22 @@ ${instructions}` : instructions,
|
|
|
1122
1672
|
);
|
|
1123
1673
|
break;
|
|
1124
1674
|
}
|
|
1125
|
-
const
|
|
1675
|
+
const msgModalities = msg.modalities ? await msg.modalities : void 0;
|
|
1676
|
+
let ttsTextInput = null;
|
|
1677
|
+
let trTextInput;
|
|
1678
|
+
if (msgModalities && !msgModalities.includes("audio") && this.tts) {
|
|
1679
|
+
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1680
|
+
this.logger.warn(
|
|
1681
|
+
"text response received from realtime API, falling back to use a TTS model."
|
|
1682
|
+
);
|
|
1683
|
+
}
|
|
1684
|
+
const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
|
|
1685
|
+
ttsTextInput = _ttsTextInput;
|
|
1686
|
+
trTextInput = _trTextInput;
|
|
1687
|
+
} else {
|
|
1688
|
+
trTextInput = msg.textStream;
|
|
1689
|
+
}
|
|
1690
|
+
const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
|
|
1126
1691
|
let textOut = null;
|
|
1127
1692
|
if (trNodeResult) {
|
|
1128
1693
|
const [textForwardTask, _textOut] = (0, import_generation.performTextForwarding)(
|
|
@@ -1135,28 +1700,46 @@ ${instructions}` : instructions,
|
|
|
1135
1700
|
}
|
|
1136
1701
|
let audioOut = null;
|
|
1137
1702
|
if (audioOutput) {
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1703
|
+
let realtimeAudioResult = null;
|
|
1704
|
+
if (ttsTextInput) {
|
|
1705
|
+
const [ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
|
|
1706
|
+
(...args) => this.agent.ttsNode(...args),
|
|
1707
|
+
ttsTextInput,
|
|
1708
|
+
modelSettings,
|
|
1709
|
+
abortController,
|
|
1710
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1711
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1712
|
+
);
|
|
1713
|
+
tasks.push(ttsTask);
|
|
1714
|
+
realtimeAudioResult = ttsGenData.audioStream;
|
|
1715
|
+
} else if (msgModalities && msgModalities.includes("audio")) {
|
|
1716
|
+
realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
|
|
1717
|
+
msg.audioStream,
|
|
1718
|
+
modelSettings
|
|
1719
|
+
);
|
|
1720
|
+
} else if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1721
|
+
this.logger.error(
|
|
1722
|
+
"Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
|
|
1723
|
+
);
|
|
1724
|
+
} else {
|
|
1725
|
+
this.logger.warn(
|
|
1726
|
+
"audio output is enabled but neither tts nor realtime audio is available"
|
|
1727
|
+
);
|
|
1728
|
+
}
|
|
1729
|
+
if (realtimeAudioResult) {
|
|
1143
1730
|
const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
|
|
1144
|
-
|
|
1731
|
+
realtimeAudioResult,
|
|
1145
1732
|
audioOutput,
|
|
1146
1733
|
abortController
|
|
1147
1734
|
);
|
|
1148
1735
|
forwardTasks.push(forwardTask);
|
|
1149
1736
|
audioOut = _audioOut;
|
|
1150
|
-
audioOut.firstFrameFut.await.
|
|
1151
|
-
} else {
|
|
1152
|
-
this.logger.warn(
|
|
1153
|
-
"audio output is enabled but neither tts nor realtime audio is available"
|
|
1154
|
-
);
|
|
1737
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1155
1738
|
}
|
|
1156
1739
|
} else if (textOut) {
|
|
1157
|
-
textOut.firstTextFut.await.
|
|
1740
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1158
1741
|
}
|
|
1159
|
-
outputs.push([msg.messageId, textOut, audioOut]);
|
|
1742
|
+
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1160
1743
|
}
|
|
1161
1744
|
await (0, import_utils.waitFor)(forwardTasks);
|
|
1162
1745
|
} catch (error) {
|
|
@@ -1169,7 +1752,7 @@ ${instructions}` : instructions,
|
|
|
1169
1752
|
const tasks = [
|
|
1170
1753
|
import_utils.Task.from(
|
|
1171
1754
|
(controller) => readMessages(controller, messageOutputs),
|
|
1172
|
-
|
|
1755
|
+
void 0,
|
|
1173
1756
|
"AgentActivity.realtime_generation.read_messages"
|
|
1174
1757
|
)
|
|
1175
1758
|
];
|
|
@@ -1197,6 +1780,8 @@ ${instructions}` : instructions,
|
|
|
1197
1780
|
);
|
|
1198
1781
|
const onToolExecutionStarted = (f) => {
|
|
1199
1782
|
speechHandle._itemAdded([f]);
|
|
1783
|
+
this.agent._chatCtx.items.push(f);
|
|
1784
|
+
this.agentSession._toolItemsAdded([f]);
|
|
1200
1785
|
};
|
|
1201
1786
|
const onToolExecutionCompleted = (out) => {
|
|
1202
1787
|
if (out.toolCallOutput) {
|
|
@@ -1216,7 +1801,6 @@ ${instructions}` : instructions,
|
|
|
1216
1801
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1217
1802
|
if (audioOutput) {
|
|
1218
1803
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1219
|
-
this.agentSession._updateAgentState("listening");
|
|
1220
1804
|
}
|
|
1221
1805
|
if (speechHandle.interrupted) {
|
|
1222
1806
|
this.logger.debug(
|
|
@@ -1226,15 +1810,15 @@ ${instructions}` : instructions,
|
|
|
1226
1810
|
replyAbortController.abort();
|
|
1227
1811
|
await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1228
1812
|
if (messageOutputs.length > 0) {
|
|
1229
|
-
const [msgId, textOut, audioOut] = messageOutputs[0];
|
|
1813
|
+
const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
|
|
1230
1814
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
1231
1815
|
if (audioOutput) {
|
|
1232
1816
|
audioOutput.clearBuffer();
|
|
1233
1817
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1234
|
-
let
|
|
1235
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1818
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
1819
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1236
1820
|
this.logger.info(
|
|
1237
|
-
{ speech_id: speechHandle.id,
|
|
1821
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1238
1822
|
"playout interrupted"
|
|
1239
1823
|
);
|
|
1240
1824
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1242,11 +1826,13 @@ ${instructions}` : instructions,
|
|
|
1242
1826
|
}
|
|
1243
1827
|
} else {
|
|
1244
1828
|
forwardedText = "";
|
|
1245
|
-
|
|
1829
|
+
playbackPositionInS = 0;
|
|
1246
1830
|
}
|
|
1247
1831
|
this.realtimeSession.truncate({
|
|
1248
1832
|
messageId: msgId,
|
|
1249
|
-
audioEndMs: Math.floor(
|
|
1833
|
+
audioEndMs: Math.floor(playbackPositionInS * 1e3),
|
|
1834
|
+
modalities: msgModalities,
|
|
1835
|
+
audioTranscript: forwardedText
|
|
1250
1836
|
});
|
|
1251
1837
|
}
|
|
1252
1838
|
if (forwardedText) {
|
|
@@ -1270,7 +1856,7 @@ ${instructions}` : instructions,
|
|
|
1270
1856
|
return;
|
|
1271
1857
|
}
|
|
1272
1858
|
if (messageOutputs.length > 0) {
|
|
1273
|
-
const [msgId, textOut, _] = messageOutputs[0];
|
|
1859
|
+
const [msgId, textOut, _, __] = messageOutputs[0];
|
|
1274
1860
|
const message = import_chat_context.ChatMessage.create({
|
|
1275
1861
|
role: "assistant",
|
|
1276
1862
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
@@ -1282,12 +1868,16 @@ ${instructions}` : instructions,
|
|
|
1282
1868
|
this.agentSession._conversationItemAdded(message);
|
|
1283
1869
|
}
|
|
1284
1870
|
speechHandle._markGenerationDone();
|
|
1285
|
-
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
1286
|
-
this.agentSession._updateAgentState("thinking");
|
|
1287
|
-
});
|
|
1288
1871
|
await executeToolsTask.result;
|
|
1289
|
-
if (toolOutput.output.length
|
|
1290
|
-
|
|
1872
|
+
if (toolOutput.output.length > 0) {
|
|
1873
|
+
this.agentSession._updateAgentState("thinking");
|
|
1874
|
+
} else if (this.agentSession.agentState === "speaking") {
|
|
1875
|
+
this.agentSession._updateAgentState("listening");
|
|
1876
|
+
}
|
|
1877
|
+
if (toolOutput.output.length === 0) {
|
|
1878
|
+
return;
|
|
1879
|
+
}
|
|
1880
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
1291
1881
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1292
1882
|
this.logger.warn(
|
|
1293
1883
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -1295,48 +1885,29 @@ ${instructions}` : instructions,
|
|
|
1295
1885
|
);
|
|
1296
1886
|
return;
|
|
1297
1887
|
}
|
|
1298
|
-
const functionToolsExecutedEvent = (
|
|
1299
|
-
functionCalls: [],
|
|
1300
|
-
functionCallOutputs: []
|
|
1301
|
-
});
|
|
1302
|
-
let shouldGenerateToolReply = false;
|
|
1303
|
-
let newAgentTask = null;
|
|
1304
|
-
let ignoreTaskSwitch = false;
|
|
1305
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1306
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1307
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1308
|
-
if (sanitizedOut.replyRequired) {
|
|
1309
|
-
shouldGenerateToolReply = true;
|
|
1310
|
-
}
|
|
1311
|
-
}
|
|
1312
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1313
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1314
|
-
ignoreTaskSwitch = true;
|
|
1315
|
-
}
|
|
1316
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1317
|
-
this.logger.debug(
|
|
1318
|
-
{
|
|
1319
|
-
speechId: speechHandle.id,
|
|
1320
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1321
|
-
args: sanitizedOut.toolCall.args,
|
|
1322
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1323
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1324
|
-
},
|
|
1325
|
-
"Tool call execution finished"
|
|
1326
|
-
);
|
|
1327
|
-
}
|
|
1888
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1328
1889
|
this.agentSession.emit(
|
|
1329
1890
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1330
1891
|
functionToolsExecutedEvent
|
|
1331
1892
|
);
|
|
1332
|
-
let
|
|
1893
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1333
1894
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1334
1895
|
this.agentSession.updateAgent(newAgentTask);
|
|
1335
|
-
|
|
1896
|
+
schedulingPaused = true;
|
|
1336
1897
|
}
|
|
1337
1898
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1899
|
+
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
1900
|
+
if (this.currentSpeech && !this.currentSpeech.done() && this.currentSpeech !== speechHandle) {
|
|
1901
|
+
await this.currentSpeech.waitForPlayout();
|
|
1902
|
+
} else {
|
|
1903
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
1904
|
+
}
|
|
1905
|
+
}
|
|
1338
1906
|
const chatCtx = this.realtimeSession.chatCtx.copy();
|
|
1339
1907
|
chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
|
|
1908
|
+
this.agentSession._toolItemsAdded(
|
|
1909
|
+
functionToolsExecutedEvent.functionCallOutputs
|
|
1910
|
+
);
|
|
1340
1911
|
try {
|
|
1341
1912
|
await this.realtimeSession.updateChatCtx(chatCtx);
|
|
1342
1913
|
} catch (error) {
|
|
@@ -1363,20 +1934,58 @@ ${instructions}` : instructions,
|
|
|
1363
1934
|
speechHandle: replySpeechHandle
|
|
1364
1935
|
})
|
|
1365
1936
|
);
|
|
1366
|
-
const toolChoice =
|
|
1937
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1367
1938
|
this.createSpeechTask({
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
})
|
|
1374
|
-
),
|
|
1939
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1940
|
+
speechHandle: replySpeechHandle,
|
|
1941
|
+
modelSettings: { toolChoice },
|
|
1942
|
+
abortController
|
|
1943
|
+
}),
|
|
1375
1944
|
ownedSpeechHandle: replySpeechHandle,
|
|
1376
1945
|
name: "AgentActivity.realtime_reply"
|
|
1377
1946
|
});
|
|
1378
1947
|
this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1379
1948
|
}
|
|
1949
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1950
|
+
var _a, _b, _c;
|
|
1951
|
+
const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
|
|
1952
|
+
functionCalls: [],
|
|
1953
|
+
functionCallOutputs: []
|
|
1954
|
+
});
|
|
1955
|
+
let shouldGenerateToolReply = false;
|
|
1956
|
+
let newAgentTask = null;
|
|
1957
|
+
let ignoreTaskSwitch = false;
|
|
1958
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1959
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1960
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1961
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1962
|
+
if (sanitizedOut.replyRequired) {
|
|
1963
|
+
shouldGenerateToolReply = true;
|
|
1964
|
+
}
|
|
1965
|
+
}
|
|
1966
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1967
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1968
|
+
ignoreTaskSwitch = true;
|
|
1969
|
+
}
|
|
1970
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1971
|
+
this.logger.debug(
|
|
1972
|
+
{
|
|
1973
|
+
speechId: speechHandle.id,
|
|
1974
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1975
|
+
args: sanitizedOut.toolCall.args,
|
|
1976
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1977
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1978
|
+
},
|
|
1979
|
+
"Tool call execution finished"
|
|
1980
|
+
);
|
|
1981
|
+
}
|
|
1982
|
+
return {
|
|
1983
|
+
functionToolsExecutedEvent,
|
|
1984
|
+
shouldGenerateToolReply,
|
|
1985
|
+
newAgentTask,
|
|
1986
|
+
ignoreTaskSwitch
|
|
1987
|
+
};
|
|
1988
|
+
}
|
|
1380
1989
|
async realtimeReplyTask({
|
|
1381
1990
|
speechHandle,
|
|
1382
1991
|
modelSettings: { toolChoice },
|
|
@@ -1384,7 +1993,7 @@ ${instructions}` : instructions,
|
|
|
1384
1993
|
instructions,
|
|
1385
1994
|
abortController
|
|
1386
1995
|
}) {
|
|
1387
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
1996
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1388
1997
|
if (!this.realtimeSession) {
|
|
1389
1998
|
throw new Error("realtime session is not available");
|
|
1390
1999
|
}
|
|
@@ -1418,72 +2027,194 @@ ${instructions}` : instructions,
|
|
|
1418
2027
|
}
|
|
1419
2028
|
}
|
|
1420
2029
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1421
|
-
if (this.
|
|
1422
|
-
throw new Error("cannot schedule new speech, the
|
|
2030
|
+
if (this.schedulingPaused && !force) {
|
|
2031
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1423
2032
|
}
|
|
1424
2033
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1425
2034
|
speechHandle._markScheduled();
|
|
1426
2035
|
this.wakeupMainTask();
|
|
1427
2036
|
}
|
|
2037
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
2038
|
+
if (this._schedulingPaused) return;
|
|
2039
|
+
this._schedulingPaused = true;
|
|
2040
|
+
this._drainBlockedTasks = blockedTasks;
|
|
2041
|
+
this.wakeupMainTask();
|
|
2042
|
+
if (this._mainTask) {
|
|
2043
|
+
await this._mainTask.result;
|
|
2044
|
+
}
|
|
2045
|
+
}
|
|
2046
|
+
_resumeSchedulingTask() {
|
|
2047
|
+
if (!this._schedulingPaused) return;
|
|
2048
|
+
this._schedulingPaused = false;
|
|
2049
|
+
this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
|
|
2050
|
+
}
|
|
2051
|
+
// Pauses the activity: stops speech scheduling (optionally waiting on
// `blockedTasks`) and tears down session resources, all under the activity
// lock so it serializes with drain()/close(). Instrumented with a
// "pause_agent_activity" telemetry span.
async pause(options = {}) {
  const { blockedTasks = [] } = options;
  const unlock = await this.lock.lock();
  try {
    const span = import_telemetry.tracer.startSpan({
      name: "pause_agent_activity",
      attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
    });
    try {
      // Stop the scheduling loop first, then release session resources.
      await this._pauseSchedulingTask(blockedTasks);
      await this._closeSessionResources();
    } finally {
      span.end();
    }
  } finally {
    unlock();
  }
}
|
|
1428
2069
|
async drain() {
|
|
1429
|
-
|
|
2070
|
+
return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
2071
|
+
name: "drain_agent_activity",
|
|
2072
|
+
context: import_api.ROOT_CONTEXT
|
|
2073
|
+
});
|
|
2074
|
+
}
|
|
2075
|
+
// Implementation of drain(): runs the agent's onExit hook as a final inline
// speech task, then pauses speech scheduling so no new speech is queued.
// `span` is the active "drain_agent_activity" telemetry span.
async _drainImpl(span) {
  span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
  // Serialize with pause()/close() via the activity lock.
  const unlock = await this.lock.lock();
  try {
    // Already paused means a prior drain/pause completed; nothing to do.
    if (this._schedulingPaused) return;
    this._onExitTask = this.createSpeechTask({
      taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
        name: "on_exit",
        attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
      }),
      inlineTask: true,
      name: "AgentActivity_onExit"
    });
    // Any speculative generation still in flight is obsolete once draining.
    this.cancelPreemptiveGeneration();
    await this._onExitTask.result;
    await this._pauseSchedulingTask([]);
  } finally {
    unlock();
  }
}
|
|
1444
2095
|
// Tears the activity down completely: cancels outstanding speech tasks,
// finalizes any in-flight speech handle, releases session resources and
// stops the main scheduling loop. Afterwards the activity is detached
// from its agent.
async close() {
  const unlock = await this.lock.lock();
  try {
    this.cancelPreemptiveGeneration();
    // Give running speech tasks a bounded window to cancel cleanly.
    await (0, import_utils.cancelAndWait)(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
    if (this._currentSpeech && !this._currentSpeech.done()) {
      this._currentSpeech._markDone();
    }
    await this._closeSessionResources();
    if (this._mainTask) {
      await this._mainTask.cancelAndWait();
    }
    // Detach adaptive-interruption listeners; the detector's backing
    // resources are released via _closeSessionResources above.
    if (this.interruptionDetector) {
      this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
      this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
      this.interruptionDetector.off("error", this.onInterruptionError);
    }
    // Break the agent <-> activity link so the agent can be re-activated.
    this.agent._agentActivity = void 0;
  } finally {
    unlock();
  }
}
|
|
2117
|
+
// Decides whether adaptive interruption detection should be enabled and,
// if so, instantiates the detector with its event listeners attached.
// Returns undefined whenever the configuration is incompatible or the
// feature is disabled/opted out.
resolveInterruptionDetector() {
  var _a, _b;
  const agentInterruptionDetection = (_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.mode;
  const sessionInterruptionDetection = this.agentSession.interruptionDetection;
  // Adaptive detection requires streaming STT with aligned transcripts, a
  // VAD, automatic turn detection, and a non-realtime LLM.
  if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
    if (agentInterruptionDetection === "adaptive" || sessionInterruptionDetection === "adaptive") {
      this.logger.warn(
        "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
      );
    }
    return void 0;
  }
  if (!this.allowInterruptions) {
    return void 0;
  }
  // An explicit "vad" choice at either the agent or session level opts out.
  if (agentInterruptionDetection === "vad") {
    return void 0;
  }
  if (sessionInterruptionDetection === "vad") {
    return void 0;
  }
  // With no explicit choice, adaptive mode stays off in production
  // (i.e. neither hosted nor dev mode).
  if (agentInterruptionDetection === void 0 && sessionInterruptionDetection === void 0 && !(0, import_utils.isHosted)() && !(0, import_utils.isDevMode)()) {
    this.logger.info("adaptive interruption is disabled by default in production mode");
    return void 0;
  }
  try {
    const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
    detector.on("overlapping_speech", this.onInterruptionOverlappingSpeech);
    detector.on("metrics_collected", this.onInterruptionMetricsCollected);
    detector.on("error", this.onInterruptionError);
    return detector;
  } catch (error) {
    // Construction can fail (e.g. a missing optional dependency); fall back
    // to VAD-based interruption rather than crashing the activity.
    this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
  }
  return void 0;
}
|
|
2153
|
+
// Restores the audio-activity interruption flag to its configured default
// after a temporary override.
restoreInterruptionByAudioActivity() {
  this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
}
|
|
2156
|
+
// Permanently disables adaptive interruption detection after an
// unrecoverable error, reverting to plain VAD-based interruption.
// Safe to call repeatedly; only the first call has an effect.
fallbackToVadInterruption() {
  if (!this.isInterruptionDetectionEnabled) return;
  this.isInterruptionDetectionEnabled = false;
  this.restoreInterruptionByAudioActivity();
  if (this.interruptionDetector) {
    // Detach our listeners before dropping the detector reference.
    this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
    this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
    this.interruptionDetector.off("error", this.onInterruptionError);
    this.interruptionDetector = void 0;
  }
  if (this.audioRecognition) {
    // Best-effort: a failure here must not mask the original error.
    this.audioRecognition.disableInterruptionDetection().catch((err) => {
      this.logger.warn({ err }, "error while disabling interruption detection");
    });
  }
  this.logger.warn(
    "adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption"
  );
}
|
|
2175
|
+
// Detaches every model/session event listener, then closes the realtime
// session and audio recognition. Shared by pause() and close(); safe to
// call when some resources were never created.
async _closeSessionResources() {
  var _a, _b, _c;
  if (this.llm instanceof import_llm.LLM) {
    this.llm.off("metrics_collected", this.onMetricsCollected);
    this.llm.off("error", this.onModelError);
  }
  if (this.realtimeSession) {
    this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
    this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
    this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
    this.realtimeSession.off(
      "input_audio_transcription_completed",
      this.onRealtimeInputAudioTranscriptionCompleted
    );
    this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
    this.realtimeSession.off("error", this.onModelError);
  }
  if (this.stt instanceof import_stt.STT) {
    this.stt.off("metrics_collected", this.onMetricsCollected);
    this.stt.off("error", this.onModelError);
  }
  if (this.tts instanceof import_tts.TTS) {
    this.tts.off("metrics_collected", this.onMetricsCollected);
    this.tts.off("error", this.onModelError);
  }
  if (this.vad instanceof import_vad.VAD) {
    this.vad.off("metrics_collected", this.onMetricsCollected);
  }
  this.detachAudioInput();
  (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
  // Close the sessions last, then drop the references so a subsequent
  // pause/close becomes a no-op.
  await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
  await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
  this.realtimeSession = void 0;
  this.audioRecognition = void 0;
}
|
|
1481
2210
|
}
|
|
1482
2211
|
// Maps a nullable tool-choice value to the OpenAI-style convention:
// `null` means "unset" and must be sent as `undefined` (omitted) instead.
function toOaiToolChoice(toolChoice) {
  if (toolChoice === null) {
    return void 0;
  }
  return toolChoice;
}
|
|
1485
2214
|
// Annotate the CommonJS export names for ESM import in node:
// (dead code at runtime — `0 &&` short-circuits — but Node's CJS named-export
// detection parses it statically to expose these names to ESM importers)
0 && (module.exports = {
  AgentActivity,
  agentActivityStorage,
  onEnterStorage
});
//# sourceMappingURL=agent_activity.cjs.map
|