@livekit/agents 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_exceptions.cjs.map +1 -1
- package/dist/_exceptions.d.ts.map +1 -1
- package/dist/_exceptions.js.map +1 -1
- package/dist/audio.cjs +89 -3
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +36 -1
- package/dist/audio.d.ts +36 -1
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +76 -2
- package/dist/audio.js.map +1 -1
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +165 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +141 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +44 -46
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +45 -47
- package/dist/cli.js.map +1 -1
- package/dist/connection_pool.cjs +242 -0
- package/dist/connection_pool.cjs.map +1 -0
- package/dist/connection_pool.d.cts +123 -0
- package/dist/connection_pool.d.ts +123 -0
- package/dist/connection_pool.d.ts.map +1 -0
- package/dist/connection_pool.js +218 -0
- package/dist/connection_pool.js.map +1 -0
- package/dist/connection_pool.test.cjs +256 -0
- package/dist/connection_pool.test.cjs.map +1 -0
- package/dist/connection_pool.test.js +255 -0
- package/dist/connection_pool.test.js.map +1 -0
- package/dist/constants.cjs +30 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +10 -0
- package/dist/constants.d.ts +10 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +20 -0
- package/dist/constants.js.map +1 -1
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/http_server.cjs +9 -6
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +5 -1
- package/dist/http_server.d.ts +5 -1
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js +9 -6
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +24 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -11
- package/dist/index.d.ts +15 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -9
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.cjs +70 -2
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +373 -32
- package/dist/inference/api_protos.d.ts +373 -32
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +62 -2
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +3 -4
- package/dist/inference/index.d.ts +3 -4
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +18 -3
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +163 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +65 -0
- package/dist/inference/interruption/http_transport.d.ts +65 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +137 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +198 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +164 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +347 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +313 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +106 -66
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +65 -43
- package/dist/inference/llm.d.ts +65 -43
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +100 -66
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +319 -170
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +64 -15
- package/dist/inference/stt.d.ts +64 -15
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +319 -170
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +218 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +217 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +249 -71
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +94 -17
- package/dist/inference/tts.d.ts +94 -17
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +249 -77
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +305 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +304 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +14 -1
- package/dist/inference/utils.d.ts +14 -1
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs +6 -3
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
- package/dist/ipc/inference_proc_executor.js +6 -3
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +6 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.ts.map +1 -1
- package/dist/ipc/job_proc_executor.js +6 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +89 -17
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +68 -18
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs +34 -8
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +8 -0
- package/dist/ipc/supervised_proc.d.ts +8 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js +34 -8
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/ipc/supervised_proc.test.cjs +145 -0
- package/dist/ipc/supervised_proc.test.cjs.map +1 -0
- package/dist/ipc/supervised_proc.test.js +122 -0
- package/dist/ipc/supervised_proc.test.js.map +1 -0
- package/dist/job.cjs +109 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +14 -0
- package/dist/job.d.ts +14 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +99 -1
- package/dist/job.js.map +1 -1
- package/dist/language.cjs +394 -0
- package/dist/language.cjs.map +1 -0
- package/dist/language.d.cts +15 -0
- package/dist/language.d.ts +15 -0
- package/dist/language.d.ts.map +1 -0
- package/dist/language.js +363 -0
- package/dist/language.js.map +1 -0
- package/dist/language.test.cjs +43 -0
- package/dist/language.test.cjs.map +1 -0
- package/dist/language.test.js +49 -0
- package/dist/language.test.js.map +1 -0
- package/dist/llm/chat_context.cjs +345 -3
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +86 -2
- package/dist/llm/chat_context.d.ts +86 -2
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +344 -3
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +692 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +692 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/fallback_adapter.cjs +280 -0
- package/dist/llm/fallback_adapter.cjs.map +1 -0
- package/dist/llm/fallback_adapter.d.cts +73 -0
- package/dist/llm/fallback_adapter.d.ts +73 -0
- package/dist/llm/fallback_adapter.d.ts.map +1 -0
- package/dist/llm/fallback_adapter.js +256 -0
- package/dist/llm/fallback_adapter.js.map +1 -0
- package/dist/llm/fallback_adapter.test.cjs +176 -0
- package/dist/llm/fallback_adapter.test.cjs.map +1 -0
- package/dist/llm/fallback_adapter.test.js +175 -0
- package/dist/llm/fallback_adapter.test.js.map +1 -0
- package/dist/llm/index.cjs +11 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +4 -3
- package/dist/llm/index.d.ts +4 -3
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +13 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +65 -11
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +13 -2
- package/dist/llm/llm.d.ts +13 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +65 -11
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs +6 -2
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js +6 -2
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/google.test.cjs +48 -0
- package/dist/llm/provider_format/google.test.cjs.map +1 -1
- package/dist/llm/provider_format/google.test.js +54 -1
- package/dist/llm/provider_format/google.test.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +126 -24
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +124 -23
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +393 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +400 -2
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +5 -4
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +5 -4
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +15 -1
- package/dist/llm/realtime.d.ts +15 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/remote_chat_context.cjs.map +1 -1
- package/dist/llm/remote_chat_context.d.cts +2 -0
- package/dist/llm/remote_chat_context.d.ts +2 -0
- package/dist/llm/remote_chat_context.d.ts.map +1 -1
- package/dist/llm/remote_chat_context.js.map +1 -1
- package/dist/llm/tool_context.cjs +50 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +47 -11
- package/dist/llm/tool_context.d.ts +47 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +48 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +107 -12
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +10 -3
- package/dist/llm/utils.d.ts +10 -3
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +106 -12
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/utils.test.cjs +90 -0
- package/dist/llm/utils.test.cjs.map +1 -1
- package/dist/llm/utils.test.js +98 -2
- package/dist/llm/utils.test.js.map +1 -1
- package/dist/llm/zod-utils.cjs +102 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +64 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +472 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +455 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/log.cjs +45 -14
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +8 -1
- package/dist/log.d.ts +8 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +45 -15
- package/dist/log.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +75 -19
- package/dist/metrics/base.d.ts +75 -19
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +5 -2
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +10 -1
- package/dist/metrics/usage_collector.d.ts +10 -1
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +5 -2
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +23 -7
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +23 -7
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +31 -10
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +31 -10
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +344 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +343 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stream/stream_channel.cjs +39 -1
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +5 -2
- package/dist/stream/stream_channel.d.ts +5 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +39 -1
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stream/stream_channel.test.cjs +27 -0
- package/dist/stream/stream_channel.test.cjs.map +1 -1
- package/dist/stream/stream_channel.test.js +27 -0
- package/dist/stream/stream_channel.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +24 -9
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +24 -9
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +94 -19
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +68 -5
- package/dist/stt/stt.d.ts +68 -5
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +96 -21
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/index.cjs +72 -0
- package/dist/telemetry/index.cjs.map +1 -0
- package/dist/telemetry/index.d.cts +7 -0
- package/dist/telemetry/index.d.ts +7 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +37 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/logging.cjs +65 -0
- package/dist/telemetry/logging.cjs.map +1 -0
- package/dist/telemetry/logging.d.cts +21 -0
- package/dist/telemetry/logging.d.ts +21 -0
- package/dist/telemetry/logging.d.ts.map +1 -0
- package/dist/telemetry/logging.js +40 -0
- package/dist/telemetry/logging.js.map +1 -0
- package/dist/telemetry/otel_http_exporter.cjs +166 -0
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
- package/dist/telemetry/otel_http_exporter.d.cts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts +63 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
- package/dist/telemetry/otel_http_exporter.js +142 -0
- package/dist/telemetry/otel_http_exporter.js.map +1 -0
- package/dist/telemetry/pino_otel_transport.cjs +217 -0
- package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
- package/dist/telemetry/pino_otel_transport.d.cts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
- package/dist/telemetry/pino_otel_transport.js +189 -0
- package/dist/telemetry/pino_otel_transport.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +233 -0
- package/dist/telemetry/trace_types.cjs.map +1 -0
- package/dist/telemetry/trace_types.d.cts +74 -0
- package/dist/telemetry/trace_types.d.ts +74 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -0
- package/dist/telemetry/trace_types.js +141 -0
- package/dist/telemetry/trace_types.js.map +1 -0
- package/dist/telemetry/traces.cjs +484 -0
- package/dist/telemetry/traces.cjs.map +1 -0
- package/dist/telemetry/traces.d.cts +116 -0
- package/dist/telemetry/traces.d.ts +116 -0
- package/dist/telemetry/traces.d.ts.map +1 -0
- package/dist/telemetry/traces.js +449 -0
- package/dist/telemetry/traces.js.map +1 -0
- package/dist/telemetry/utils.cjs +86 -0
- package/dist/telemetry/utils.cjs.map +1 -0
- package/dist/telemetry/utils.d.cts +5 -0
- package/dist/telemetry/utils.d.ts +5 -0
- package/dist/telemetry/utils.d.ts.map +1 -0
- package/dist/telemetry/utils.js +51 -0
- package/dist/telemetry/utils.js.map +1 -0
- package/dist/tokenize/basic/sentence.cjs +3 -3
- package/dist/tokenize/basic/sentence.cjs.map +1 -1
- package/dist/tokenize/basic/sentence.js +3 -3
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +3 -1
- package/dist/tokenize/tokenizer.test.cjs.map +1 -1
- package/dist/tokenize/tokenizer.test.js +3 -1
- package/dist/tokenize/tokenizer.test.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +472 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +448 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +25 -8
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +25 -8
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +189 -57
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +58 -6
- package/dist/tts/tts.d.ts +58 -6
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +191 -59
- package/dist/tts/tts.js.map +1 -1
- package/dist/typed_promise.cjs +48 -0
- package/dist/typed_promise.cjs.map +1 -0
- package/dist/typed_promise.d.cts +24 -0
- package/dist/typed_promise.d.ts +24 -0
- package/dist/typed_promise.d.ts.map +1 -0
- package/dist/typed_promise.js +28 -0
- package/dist/typed_promise.js.map +1 -0
- package/dist/types.cjs +24 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +45 -10
- package/dist/types.d.ts +45 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +20 -30
- package/dist/types.js.map +1 -1
- package/dist/utils.cjs +124 -28
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +41 -1
- package/dist/utils.d.ts +41 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +119 -27
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +73 -1
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +74 -10
- package/dist/utils.test.js.map +1 -1
- package/dist/vad.cjs +35 -15
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +15 -5
- package/dist/vad.d.ts +15 -5
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +35 -15
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +258 -35
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +54 -13
- package/dist/voice/agent.d.ts +54 -13
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +254 -34
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +314 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +316 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1116 -385
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +72 -11
- package/dist/voice/agent_activity.d.ts +72 -11
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +1119 -383
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_activity.test.cjs +135 -0
- package/dist/voice/agent_activity.test.cjs.map +1 -0
- package/dist/voice/agent_activity.test.js +134 -0
- package/dist/voice/agent_activity.test.js.map +1 -0
- package/dist/voice/agent_session.cjs +550 -90
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +185 -25
- package/dist/voice/agent_session.d.ts +185 -25
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +556 -91
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +605 -46
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +96 -4
- package/dist/voice/audio_recognition.d.ts +96 -4
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +611 -47
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +295 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +299 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/avatar/datastream_io.cjs +7 -1
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +7 -1
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs +367 -0
- package/dist/voice/background_audio.cjs.map +1 -0
- package/dist/voice/background_audio.d.cts +123 -0
- package/dist/voice/background_audio.d.ts +123 -0
- package/dist/voice/background_audio.d.ts.map +1 -0
- package/dist/voice/background_audio.js +343 -0
- package/dist/voice/background_audio.js.map +1 -0
- package/dist/voice/events.cjs +3 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +16 -9
- package/dist/voice/events.d.ts +16 -9
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +3 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +205 -41
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +21 -5
- package/dist/voice/generation.d.ts +21 -5
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +215 -43
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/generation_tools.test.cjs +236 -0
- package/dist/voice/generation_tools.test.cjs.map +1 -0
- package/dist/voice/generation_tools.test.js +235 -0
- package/dist/voice/generation_tools.test.js.map +1 -0
- package/dist/voice/index.cjs +33 -2
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +8 -2
- package/dist/voice/index.d.ts +8 -2
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +19 -2
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/io.cjs +66 -6
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +67 -7
- package/dist/voice/io.d.ts +67 -7
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +62 -5
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +607 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +573 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/remote_session.cjs +922 -0
- package/dist/voice/remote_session.cjs.map +1 -0
- package/dist/voice/remote_session.d.cts +108 -0
- package/dist/voice/remote_session.d.ts +108 -0
- package/dist/voice/remote_session.d.ts.map +1 -0
- package/dist/voice/remote_session.js +887 -0
- package/dist/voice/remote_session.js.map +1 -0
- package/dist/voice/report.cjs +88 -0
- package/dist/voice/report.cjs.map +1 -0
- package/dist/voice/report.d.cts +49 -0
- package/dist/voice/report.d.ts +49 -0
- package/dist/voice/report.d.ts.map +1 -0
- package/dist/voice/report.js +63 -0
- package/dist/voice/report.js.map +1 -0
- package/dist/voice/report.test.cjs +121 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +120 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/_input.cjs +40 -7
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +5 -2
- package/dist/voice/room_io/_input.d.ts +5 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +41 -8
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +19 -11
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +7 -4
- package/dist/voice/room_io/_output.d.ts +7 -4
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +20 -12
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +33 -6
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +29 -9
- package/dist/voice/room_io/room_io.d.ts +29 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +33 -7
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +22 -4
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +17 -2
- package/dist/voice/speech_handle.d.ts +17 -2
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +21 -4
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +57 -0
- package/dist/voice/testing/index.cjs.map +1 -0
- package/dist/voice/testing/index.d.cts +21 -0
- package/dist/voice/testing/index.d.ts +21 -0
- package/dist/voice/testing/index.d.ts.map +1 -0
- package/dist/voice/testing/index.js +35 -0
- package/dist/voice/testing/index.js.map +1 -0
- package/dist/voice/testing/run_result.cjs +817 -0
- package/dist/voice/testing/run_result.cjs.map +1 -0
- package/dist/voice/testing/run_result.d.cts +385 -0
- package/dist/voice/testing/run_result.d.ts +385 -0
- package/dist/voice/testing/run_result.d.ts.map +1 -0
- package/dist/voice/testing/run_result.js +790 -0
- package/dist/voice/testing/run_result.js.map +1 -0
- package/dist/voice/testing/types.cjs +46 -0
- package/dist/voice/testing/types.cjs.map +1 -0
- package/dist/voice/testing/types.d.cts +83 -0
- package/dist/voice/testing/types.d.ts +83 -0
- package/dist/voice/testing/types.d.ts.map +1 -0
- package/dist/voice/testing/types.js +19 -0
- package/dist/voice/testing/types.js.map +1 -0
- package/dist/voice/transcription/synchronizer.cjs +139 -15
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts +35 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +143 -16
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +157 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +37 -0
- package/dist/voice/turn_config/utils.d.ts +37 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +131 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +128 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +127 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/dist/worker.cjs +44 -52
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +18 -8
- package/dist/worker.d.ts +18 -8
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +43 -43
- package/dist/worker.js.map +1 -1
- package/package.json +35 -13
- package/resources/NOTICE +2 -0
- package/resources/keyboard-typing.ogg +0 -0
- package/resources/keyboard-typing2.ogg +0 -0
- package/resources/office-ambience.ogg +0 -0
- package/src/_exceptions.ts +5 -0
- package/src/audio.ts +132 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +203 -0
- package/src/cli.ts +57 -66
- package/src/connection_pool.test.ts +346 -0
- package/src/connection_pool.ts +307 -0
- package/src/constants.ts +14 -0
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/http_server.ts +18 -6
- package/src/index.ts +15 -13
- package/src/inference/api_protos.ts +85 -2
- package/src/inference/index.ts +32 -4
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +207 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +204 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +416 -0
- package/src/inference/llm.ts +214 -163
- package/src/inference/stt.test.ts +253 -0
- package/src/inference/stt.ts +449 -208
- package/src/inference/tts.test.ts +354 -0
- package/src/inference/tts.ts +417 -115
- package/src/inference/utils.ts +30 -2
- package/src/ipc/inference_proc_executor.ts +11 -3
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_executor.ts +11 -1
- package/src/ipc/job_proc_lazy_main.ts +86 -20
- package/src/ipc/supervised_proc.test.ts +153 -0
- package/src/ipc/supervised_proc.ts +39 -10
- package/src/job.ts +120 -1
- package/src/language.test.ts +62 -0
- package/src/language.ts +380 -0
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
- package/src/llm/chat_context.test.ts +787 -0
- package/src/llm/chat_context.ts +493 -2
- package/src/llm/fallback_adapter.test.ts +238 -0
- package/src/llm/fallback_adapter.ts +394 -0
- package/src/llm/index.ts +13 -0
- package/src/llm/llm.ts +77 -12
- package/src/llm/provider_format/google.test.ts +72 -1
- package/src/llm/provider_format/google.ts +10 -6
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +480 -2
- package/src/llm/provider_format/openai.ts +152 -21
- package/src/llm/provider_format/utils.ts +11 -5
- package/src/llm/realtime.ts +23 -2
- package/src/llm/remote_chat_context.ts +2 -2
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +115 -17
- package/src/llm/utils.test.ts +103 -2
- package/src/llm/utils.ts +152 -16
- package/src/llm/zod-utils.test.ts +577 -0
- package/src/llm/zod-utils.ts +153 -0
- package/src/log.ts +71 -19
- package/src/metrics/base.ts +78 -19
- package/src/metrics/index.ts +12 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +14 -3
- package/src/metrics/utils.ts +27 -7
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +43 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +545 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stream/stream_channel.test.ts +37 -0
- package/src/stream/stream_channel.ts +43 -3
- package/src/stt/stream_adapter.ts +30 -9
- package/src/stt/stt.ts +140 -23
- package/src/telemetry/index.ts +28 -0
- package/src/telemetry/logging.ts +55 -0
- package/src/telemetry/otel_http_exporter.ts +218 -0
- package/src/telemetry/pino_otel_transport.ts +265 -0
- package/src/telemetry/trace_types.ts +109 -0
- package/src/telemetry/traces.ts +673 -0
- package/src/telemetry/utils.ts +61 -0
- package/src/tokenize/basic/sentence.ts +3 -3
- package/src/tokenize/tokenizer.test.ts +4 -0
- package/src/transcription.ts +6 -0
- package/src/tts/fallback_adapter.ts +586 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/stream_adapter.ts +38 -8
- package/src/tts/tts.ts +245 -62
- package/src/typed_promise.ts +67 -0
- package/src/types.ts +62 -33
- package/src/utils.test.ts +90 -10
- package/src/utils.ts +178 -33
- package/src/vad.ts +42 -18
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +347 -2
- package/src/voice/agent.ts +346 -44
- package/src/voice/agent_activity.test.ts +194 -0
- package/src/voice/agent_activity.ts +1457 -388
- package/src/voice/agent_session.ts +817 -112
- package/src/voice/audio_recognition.ts +845 -70
- package/src/voice/audio_recognition_span.test.ts +341 -0
- package/src/voice/avatar/datastream_io.ts +9 -1
- package/src/voice/background_audio.ts +494 -0
- package/src/voice/events.ts +27 -7
- package/src/voice/generation.ts +310 -56
- package/src/voice/generation_tools.test.ts +268 -0
- package/src/voice/index.ts +17 -3
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/io.ts +115 -12
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +783 -0
- package/src/voice/remote_session.ts +1083 -0
- package/src/voice/report.test.ts +136 -0
- package/src/voice/report.ts +140 -0
- package/src/voice/room_io/_input.ts +45 -10
- package/src/voice/room_io/_output.ts +26 -14
- package/src/voice/room_io/room_io.ts +67 -22
- package/src/voice/speech_handle.ts +38 -6
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +52 -0
- package/src/voice/testing/run_result.ts +995 -0
- package/src/voice/testing/types.ts +118 -0
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +204 -19
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +148 -0
- package/src/voice/turn_config/utils.ts +167 -0
- package/src/voice/utils.ts +29 -0
- package/src/worker.ts +92 -78
- package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
|
@@ -1,73 +1,150 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { ParticipantKind } from '@livekit/rtc-node';
|
|
4
5
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
6
|
+
import {
|
|
7
|
+
type Context,
|
|
8
|
+
ROOT_CONTEXT,
|
|
9
|
+
type Span,
|
|
10
|
+
context as otelContext,
|
|
11
|
+
trace,
|
|
12
|
+
} from '@opentelemetry/api';
|
|
5
13
|
import type { WritableStreamDefaultWriter } from 'node:stream/web';
|
|
6
14
|
import { ReadableStream } from 'node:stream/web';
|
|
15
|
+
import { isAPIError } from '../_exceptions.js';
|
|
16
|
+
import { apiConnectDefaults, intervalForRetry } from '../inference/interruption/defaults.js';
|
|
17
|
+
import { InterruptionDetectionError } from '../inference/interruption/errors.js';
|
|
18
|
+
import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
19
|
+
import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js';
|
|
20
|
+
import {
|
|
21
|
+
type InterruptionSentinel,
|
|
22
|
+
type OverlappingSpeechEvent,
|
|
23
|
+
} from '../inference/interruption/types.js';
|
|
24
|
+
import type { LanguageCode } from '../language.js';
|
|
7
25
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
8
26
|
import { log } from '../log.js';
|
|
9
27
|
import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
|
|
10
28
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
11
29
|
import { mergeReadableStreams } from '../stream/merge_readable_streams.js';
|
|
30
|
+
import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js';
|
|
12
31
|
import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
|
|
13
|
-
import {
|
|
32
|
+
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
33
|
+
import { Task, delay, waitForAbort } from '../utils.js';
|
|
14
34
|
import { type VAD, type VADEvent, VADEventType } from '../vad.js';
|
|
15
35
|
import type { TurnDetectionMode } from './agent_session.js';
|
|
16
36
|
import type { STTNode } from './io.js';
|
|
37
|
+
import { setParticipantSpanAttributes } from './utils.js';
|
|
17
38
|
|
|
18
39
|
export interface EndOfTurnInfo {
|
|
40
|
+
/** The new transcript text from the user's speech. */
|
|
19
41
|
newTranscript: string;
|
|
42
|
+
/** Confidence score of the transcript (0-1). */
|
|
43
|
+
transcriptConfidence: number;
|
|
44
|
+
/** Delay from speech stop to final transcription in milliseconds. */
|
|
20
45
|
transcriptionDelay: number;
|
|
46
|
+
/** Delay from speech stop to end of utterance detection in milliseconds. */
|
|
21
47
|
endOfUtteranceDelay: number;
|
|
48
|
+
/** Timestamp when user started speaking (milliseconds since epoch). */
|
|
49
|
+
startedSpeakingAt: number | undefined;
|
|
50
|
+
/** Timestamp when user stopped speaking (milliseconds since epoch). */
|
|
51
|
+
stoppedSpeakingAt: number | undefined;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export interface PreemptiveGenerationInfo {
|
|
55
|
+
newTranscript: string;
|
|
56
|
+
transcriptConfidence: number;
|
|
22
57
|
}
|
|
23
58
|
|
|
24
59
|
export interface RecognitionHooks {
|
|
60
|
+
onInterruption: (ev: OverlappingSpeechEvent) => void;
|
|
25
61
|
onStartOfSpeech: (ev: VADEvent) => void;
|
|
26
62
|
onVADInferenceDone: (ev: VADEvent) => void;
|
|
27
63
|
onEndOfSpeech: (ev: VADEvent) => void;
|
|
28
64
|
onInterimTranscript: (ev: SpeechEvent) => void;
|
|
29
65
|
onFinalTranscript: (ev: SpeechEvent) => void;
|
|
30
66
|
onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
|
|
67
|
+
onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
|
|
31
68
|
|
|
32
69
|
retrieveChatCtx: () => ChatContext;
|
|
33
70
|
}
|
|
34
71
|
|
|
35
72
|
export interface _TurnDetector {
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
73
|
+
/** The model name used by this turn detector. */
|
|
74
|
+
readonly model: string;
|
|
75
|
+
/** The provider name for this turn detector. */
|
|
76
|
+
readonly provider: string;
|
|
77
|
+
unlikelyThreshold: (language?: LanguageCode) => Promise<number | undefined>;
|
|
78
|
+
supportsLanguage: (language?: LanguageCode) => Promise<boolean>;
|
|
79
|
+
predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
|
|
39
80
|
}
|
|
40
81
|
|
|
41
82
|
export interface AudioRecognitionOptions {
|
|
83
|
+
/** Hooks for recognition events. */
|
|
42
84
|
recognitionHooks: RecognitionHooks;
|
|
85
|
+
/** Speech-to-text node. */
|
|
43
86
|
stt?: STTNode;
|
|
87
|
+
/** Voice activity detection. */
|
|
44
88
|
vad?: VAD;
|
|
89
|
+
/** Turn detector for end-of-turn prediction. */
|
|
45
90
|
turnDetector?: _TurnDetector;
|
|
46
|
-
|
|
91
|
+
/** Turn detection mode. */
|
|
92
|
+
turnDetectionMode?: TurnDetectionMode;
|
|
93
|
+
interruptionDetection?: AdaptiveInterruptionDetector;
|
|
94
|
+
/** Minimum endpointing delay in milliseconds. */
|
|
47
95
|
minEndpointingDelay: number;
|
|
96
|
+
/** Maximum endpointing delay in milliseconds. */
|
|
48
97
|
maxEndpointingDelay: number;
|
|
98
|
+
/** Root span context for tracing. */
|
|
99
|
+
rootSpanContext?: Context;
|
|
100
|
+
/** STT model name for tracing */
|
|
101
|
+
sttModel?: string;
|
|
102
|
+
/** STT provider name for tracing */
|
|
103
|
+
sttProvider?: string;
|
|
104
|
+
/** Getter for linked participant for span attribution */
|
|
105
|
+
getLinkedParticipant?: () => ParticipantLike | undefined;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Minimal participant shape for span attribution.
|
|
110
|
+
* Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
|
|
111
|
+
*/
|
|
112
|
+
export interface ParticipantLike {
|
|
113
|
+
sid: string | undefined;
|
|
114
|
+
identity: string;
|
|
115
|
+
kind: ParticipantKind;
|
|
49
116
|
}
|
|
50
117
|
|
|
118
|
+
// TODO add ability to update stt/vad/interruption-detection
|
|
51
119
|
export class AudioRecognition {
|
|
52
120
|
private hooks: RecognitionHooks;
|
|
53
121
|
private stt?: STTNode;
|
|
54
122
|
private vad?: VAD;
|
|
55
123
|
private turnDetector?: _TurnDetector;
|
|
56
|
-
private turnDetectionMode?:
|
|
124
|
+
private turnDetectionMode?: TurnDetectionMode;
|
|
57
125
|
private minEndpointingDelay: number;
|
|
58
126
|
private maxEndpointingDelay: number;
|
|
59
|
-
private lastLanguage?:
|
|
127
|
+
private lastLanguage?: LanguageCode;
|
|
128
|
+
private rootSpanContext?: Context;
|
|
129
|
+
private sttModel?: string;
|
|
130
|
+
private sttProvider?: string;
|
|
131
|
+
private getLinkedParticipant?: () => ParticipantLike | undefined;
|
|
60
132
|
|
|
61
133
|
private deferredInputStream: DeferredReadableStream<AudioFrame>;
|
|
62
134
|
private logger = log();
|
|
63
135
|
private lastFinalTranscriptTime = 0;
|
|
64
136
|
private audioTranscript = '';
|
|
65
137
|
private audioInterimTranscript = '';
|
|
66
|
-
private
|
|
138
|
+
private audioPreflightTranscript = '';
|
|
139
|
+
private finalTranscriptConfidence: number[] = [];
|
|
140
|
+
private lastSpeakingTime: number | undefined;
|
|
141
|
+
private speechStartTime: number | undefined;
|
|
67
142
|
private userTurnCommitted = false;
|
|
68
143
|
private speaking = false;
|
|
69
144
|
private sampleRate?: number;
|
|
70
145
|
|
|
146
|
+
private userTurnSpan?: Span;
|
|
147
|
+
|
|
71
148
|
private vadInputStream: ReadableStream<AudioFrame>;
|
|
72
149
|
private sttInputStream: ReadableStream<AudioFrame>;
|
|
73
150
|
private silenceAudioTransform = new IdentityTransform<AudioFrame>();
|
|
@@ -78,6 +155,16 @@ export class AudioRecognition {
|
|
|
78
155
|
private commitUserTurnTask?: Task<void>;
|
|
79
156
|
private vadTask?: Task<void>;
|
|
80
157
|
private sttTask?: Task<void>;
|
|
158
|
+
private interruptionTask?: Task<void>;
|
|
159
|
+
|
|
160
|
+
// interruption detection
|
|
161
|
+
private interruptionDetection?: AdaptiveInterruptionDetector;
|
|
162
|
+
private _inputStartedAt?: number;
|
|
163
|
+
private ignoreUserTranscriptUntil?: number;
|
|
164
|
+
private transcriptBuffer: SpeechEvent[];
|
|
165
|
+
private isInterruptionEnabled: boolean;
|
|
166
|
+
private isAgentSpeaking: boolean;
|
|
167
|
+
private interruptionStreamChannel?: StreamChannel<InterruptionSentinel | AudioFrame>;
|
|
81
168
|
|
|
82
169
|
constructor(opts: AudioRecognitionOptions) {
|
|
83
170
|
this.hooks = opts.recognitionHooks;
|
|
@@ -88,11 +175,35 @@ export class AudioRecognition {
|
|
|
88
175
|
this.minEndpointingDelay = opts.minEndpointingDelay;
|
|
89
176
|
this.maxEndpointingDelay = opts.maxEndpointingDelay;
|
|
90
177
|
this.lastLanguage = undefined;
|
|
178
|
+
this.rootSpanContext = opts.rootSpanContext;
|
|
179
|
+
this.sttModel = opts.sttModel;
|
|
180
|
+
this.sttProvider = opts.sttProvider;
|
|
181
|
+
this.getLinkedParticipant = opts.getLinkedParticipant;
|
|
91
182
|
|
|
92
183
|
this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
|
|
93
|
-
|
|
94
|
-
this.
|
|
95
|
-
this.
|
|
184
|
+
this.interruptionDetection = opts.interruptionDetection;
|
|
185
|
+
this.transcriptBuffer = [];
|
|
186
|
+
this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad);
|
|
187
|
+
this.isAgentSpeaking = false;
|
|
188
|
+
|
|
189
|
+
if (opts.interruptionDetection) {
|
|
190
|
+
const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee();
|
|
191
|
+
const [inputStream, sttInputStream] = teedInput.tee();
|
|
192
|
+
this.vadInputStream = vadInputStream;
|
|
193
|
+
this.sttInputStream = mergeReadableStreams(
|
|
194
|
+
sttInputStream,
|
|
195
|
+
this.silenceAudioTransform.readable,
|
|
196
|
+
);
|
|
197
|
+
this.interruptionStreamChannel = createStreamChannel();
|
|
198
|
+
this.interruptionStreamChannel.addStreamInput(inputStream);
|
|
199
|
+
} else {
|
|
200
|
+
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
201
|
+
this.vadInputStream = vadInputStream;
|
|
202
|
+
this.sttInputStream = mergeReadableStreams(
|
|
203
|
+
sttInputStream,
|
|
204
|
+
this.silenceAudioTransform.readable,
|
|
205
|
+
);
|
|
206
|
+
}
|
|
96
207
|
this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
|
|
97
208
|
}
|
|
98
209
|
|
|
@@ -106,6 +217,16 @@ export class AudioRecognition {
|
|
|
106
217
|
return this.audioTranscript;
|
|
107
218
|
}
|
|
108
219
|
|
|
220
|
+
/** @internal */
|
|
221
|
+
get inputStartedAt() {
|
|
222
|
+
return this._inputStartedAt;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/** @internal */
|
|
226
|
+
updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void {
|
|
227
|
+
this.turnDetectionMode = options.turnDetection;
|
|
228
|
+
}
|
|
229
|
+
|
|
109
230
|
async start() {
|
|
110
231
|
this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));
|
|
111
232
|
this.vadTask.result.catch((err) => {
|
|
@@ -116,6 +237,251 @@ export class AudioRecognition {
|
|
|
116
237
|
this.sttTask.result.catch((err) => {
|
|
117
238
|
this.logger.error(`Error running STT task: ${err}`);
|
|
118
239
|
});
|
|
240
|
+
|
|
241
|
+
this.interruptionTask = Task.from(({ signal }) =>
|
|
242
|
+
this.createInterruptionTask(this.interruptionDetection, signal),
|
|
243
|
+
);
|
|
244
|
+
this.interruptionTask.result.catch((err) => {
|
|
245
|
+
this.logger.error(`Error running interruption task: ${err}`);
|
|
246
|
+
});
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
async stop() {
|
|
250
|
+
await this.sttTask?.cancelAndWait();
|
|
251
|
+
await this.vadTask?.cancelAndWait();
|
|
252
|
+
await this.interruptionTask?.cancelAndWait();
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
async disableInterruptionDetection(): Promise<void> {
|
|
256
|
+
this.isInterruptionEnabled = false;
|
|
257
|
+
this.interruptionDetection = undefined;
|
|
258
|
+
await this.interruptionTask?.cancelAndWait();
|
|
259
|
+
this.interruptionTask = undefined;
|
|
260
|
+
await this.interruptionStreamChannel?.close();
|
|
261
|
+
this.interruptionStreamChannel = undefined;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
async onStartOfAgentSpeech() {
|
|
265
|
+
this.isAgentSpeaking = true;
|
|
266
|
+
return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted());
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) {
|
|
270
|
+
if (!this.isInterruptionEnabled) {
|
|
271
|
+
this.isAgentSpeaking = false;
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
const inputOpen = await this.trySendInterruptionSentinel(
|
|
276
|
+
InterruptionStreamSentinel.agentSpeechEnded(),
|
|
277
|
+
);
|
|
278
|
+
if (!inputOpen) {
|
|
279
|
+
this.isAgentSpeaking = false;
|
|
280
|
+
return;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
if (this.isAgentSpeaking) {
|
|
284
|
+
if (this.ignoreUserTranscriptUntil === undefined) {
|
|
285
|
+
this.onEndOfOverlapSpeech(Date.now());
|
|
286
|
+
}
|
|
287
|
+
this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil
|
|
288
|
+
? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil)
|
|
289
|
+
: ignoreUserTranscriptUntil;
|
|
290
|
+
|
|
291
|
+
// flush held transcripts if possible
|
|
292
|
+
await this.flushHeldTranscripts();
|
|
293
|
+
}
|
|
294
|
+
this.isAgentSpeaking = false;
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
/** Start interruption inference when agent is speaking and overlap speech starts. */
|
|
298
|
+
async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) {
|
|
299
|
+
if (this.isAgentSpeaking) {
|
|
300
|
+
this.trySendInterruptionSentinel(
|
|
301
|
+
InterruptionStreamSentinel.overlapSpeechStarted(
|
|
302
|
+
speechDuration,
|
|
303
|
+
startedAt,
|
|
304
|
+
userSpeakingSpan,
|
|
305
|
+
),
|
|
306
|
+
);
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
/** End interruption inference when overlap speech ends. */
|
|
311
|
+
async onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span) {
|
|
312
|
+
if (!this.isInterruptionEnabled) {
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
315
|
+
if (userSpeakingSpan && userSpeakingSpan.isRecording()) {
|
|
316
|
+
userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false');
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded(endedAt));
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
/**
|
|
323
|
+
* Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp.
|
|
324
|
+
* If the event has no timestamps, we assume it is the same as the next valid event.
|
|
325
|
+
*/
|
|
326
|
+
private async flushHeldTranscripts() {
|
|
327
|
+
if (
|
|
328
|
+
!this.isInterruptionEnabled ||
|
|
329
|
+
this.ignoreUserTranscriptUntil === undefined ||
|
|
330
|
+
this.transcriptBuffer.length === 0
|
|
331
|
+
) {
|
|
332
|
+
return;
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
if (!this._inputStartedAt) {
|
|
336
|
+
this.transcriptBuffer = [];
|
|
337
|
+
this.ignoreUserTranscriptUntil = undefined;
|
|
338
|
+
return;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
let emitFromIndex: number | null = null;
|
|
342
|
+
let shouldFlush = false;
|
|
343
|
+
|
|
344
|
+
for (let i = 0; i < this.transcriptBuffer.length; i++) {
|
|
345
|
+
const ev = this.transcriptBuffer[i];
|
|
346
|
+
if (!ev || !ev.alternatives || ev.alternatives.length === 0) {
|
|
347
|
+
emitFromIndex = Math.min(emitFromIndex ?? i, i);
|
|
348
|
+
continue;
|
|
349
|
+
}
|
|
350
|
+
const firstAlternative = ev.alternatives[0];
|
|
351
|
+
if (
|
|
352
|
+
firstAlternative.startTime === firstAlternative.endTime &&
|
|
353
|
+
firstAlternative.startTime === 0
|
|
354
|
+
) {
|
|
355
|
+
this.transcriptBuffer = [];
|
|
356
|
+
this.ignoreUserTranscriptUntil = undefined;
|
|
357
|
+
return;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if (this.#alternativeEndsBeforeIgnoreWindow(firstAlternative)) {
|
|
361
|
+
emitFromIndex = null;
|
|
362
|
+
} else {
|
|
363
|
+
emitFromIndex = Math.min(emitFromIndex ?? i, i);
|
|
364
|
+
shouldFlush = true;
|
|
365
|
+
break;
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
const eventsToEmit =
|
|
370
|
+
emitFromIndex !== null && shouldFlush ? this.transcriptBuffer.slice(emitFromIndex) : [];
|
|
371
|
+
|
|
372
|
+
this.transcriptBuffer = [];
|
|
373
|
+
this.ignoreUserTranscriptUntil = undefined;
|
|
374
|
+
|
|
375
|
+
for (const event of eventsToEmit) {
|
|
376
|
+
this.logger.trace(
|
|
377
|
+
{
|
|
378
|
+
event: event.type,
|
|
379
|
+
},
|
|
380
|
+
're-emitting held user transcript',
|
|
381
|
+
);
|
|
382
|
+
this.onSTTEvent(event);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
#alternativeEndsBeforeIgnoreWindow(
|
|
387
|
+
alternative: NonNullable<SpeechEvent['alternatives']>[number],
|
|
388
|
+
): boolean {
|
|
389
|
+
if (
|
|
390
|
+
this.ignoreUserTranscriptUntil === undefined ||
|
|
391
|
+
!this._inputStartedAt ||
|
|
392
|
+
alternative.startTime <= 0
|
|
393
|
+
) {
|
|
394
|
+
return false;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// `SpeechData.startTime` is in seconds relative to audio start, while `inputStartedAt` and
|
|
398
|
+
// `ignoreUserTranscriptUntil` are epoch milliseconds.
|
|
399
|
+
return alternative.startTime * 1000 + this._inputStartedAt < this.ignoreUserTranscriptUntil;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
private shouldHoldSttEvent(ev: SpeechEvent): boolean {
|
|
403
|
+
if (!this.isInterruptionEnabled) {
|
|
404
|
+
return false;
|
|
405
|
+
}
|
|
406
|
+
if (this.isAgentSpeaking) {
|
|
407
|
+
return true;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
// reset when the user starts speaking after the agent speech
|
|
411
|
+
if (ev.type === SpeechEventType.START_OF_SPEECH) {
|
|
412
|
+
this.ignoreUserTranscriptUntil = undefined;
|
|
413
|
+
this.transcriptBuffer = [];
|
|
414
|
+
return false;
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
if (this.ignoreUserTranscriptUntil === undefined) {
|
|
418
|
+
return false;
|
|
419
|
+
}
|
|
420
|
+
// sentinel events are always held until we have something concrete to release them
|
|
421
|
+
if (!ev.alternatives || ev.alternatives.length === 0) {
|
|
422
|
+
return true;
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
const alternative = ev.alternatives[0];
|
|
426
|
+
|
|
427
|
+
if (
|
|
428
|
+
alternative.startTime !== alternative.endTime &&
|
|
429
|
+
this.#alternativeEndsBeforeIgnoreWindow(alternative)
|
|
430
|
+
) {
|
|
431
|
+
return true;
|
|
432
|
+
}
|
|
433
|
+
return false;
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
/**
 * Writes a frame or sentinel to the interruption stream channel when that is possible.
 *
 * @param frame - The audio frame or interruption sentinel to forward.
 * @returns `true` when the write succeeded; `false` when interruption handling is disabled,
 *   the channel is missing or closed, or the write threw (the failure is logged as a warning).
 */
private async trySendInterruptionSentinel(
  frame: AudioFrame | InterruptionSentinel,
): Promise<boolean> {
  if (!this.isInterruptionEnabled) return false;

  const channel = this.interruptionStreamChannel;
  if (!channel || channel.closed) return false;

  try {
    await channel.write(frame);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    this.logger.warn(`could not forward interruption sentinel: ${reason}`);
    return false;
  }
  return true;
}
|
|
455
|
+
|
|
456
|
+
/**
 * Returns the current `user_turn` span, starting a new one when none is recording.
 *
 * A newly started span is stored on `this.userTurnSpan` and annotated with the linked
 * participant and the STT model/provider when those are available.
 *
 * @param startTime - Optional start time passed through to the new span.
 * @returns The span that is recording the current user turn.
 */
private ensureUserTurnSpan(startTime?: number): Span {
  const current = this.userTurnSpan;
  if (current?.isRecording()) {
    return current;
  }

  const span = tracer.startSpan({
    name: 'user_turn',
    context: this.rootSpanContext,
    startTime,
  });
  this.userTurnSpan = span;

  const participant = this.getLinkedParticipant?.();
  if (participant) setParticipantSpanAttributes(span, participant);

  if (this.sttModel) span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
  if (this.sttProvider) span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);

  return span;
}
|
|
481
|
+
|
|
482
|
+
/**
 * Builds a context that carries `span`, layered on the root span context when one exists
 * and on `ROOT_CONTEXT` otherwise.
 *
 * @param span - The span to attach to the context.
 * @returns The derived context.
 */
private userTurnContext(span: Span): Context {
  return trace.setSpan(this.rootSpanContext ?? ROOT_CONTEXT, span);
}
|
|
120
486
|
|
|
121
487
|
private async onSTTEvent(ev: SpeechEvent) {
|
|
@@ -140,10 +506,29 @@ export class AudioRecognition {
|
|
|
140
506
|
return;
|
|
141
507
|
}
|
|
142
508
|
|
|
509
|
+
// handle interruption detection
|
|
510
|
+
// - hold the event until the ignore_user_transcript_until expires
|
|
511
|
+
// - release only relevant events
|
|
512
|
+
// - allow RECOGNITION_USAGE to pass through immediately
|
|
513
|
+
|
|
514
|
+
if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) {
|
|
515
|
+
if (this.shouldHoldSttEvent(ev)) {
|
|
516
|
+
this.logger.trace(
|
|
517
|
+
{ event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil },
|
|
518
|
+
'holding STT event until ignore_user_transcript_until expires',
|
|
519
|
+
);
|
|
520
|
+
this.transcriptBuffer.push(ev);
|
|
521
|
+
return;
|
|
522
|
+
} else {
|
|
523
|
+
await this.flushHeldTranscripts();
|
|
524
|
+
// no return here to allow the new event to be processed normally
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
|
|
143
528
|
switch (ev.type) {
|
|
144
529
|
case SpeechEventType.FINAL_TRANSCRIPT:
|
|
145
|
-
this.hooks.onFinalTranscript(ev);
|
|
146
530
|
const transcript = ev.alternatives?.[0]?.text;
|
|
531
|
+
const confidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
147
532
|
this.lastLanguage = ev.alternatives?.[0]?.language;
|
|
148
533
|
|
|
149
534
|
if (!transcript) {
|
|
@@ -151,6 +536,8 @@ export class AudioRecognition {
|
|
|
151
536
|
return;
|
|
152
537
|
}
|
|
153
538
|
|
|
539
|
+
this.hooks.onFinalTranscript(ev);
|
|
540
|
+
|
|
154
541
|
this.logger.debug(
|
|
155
542
|
{
|
|
156
543
|
user_transcript: transcript,
|
|
@@ -162,34 +549,156 @@ export class AudioRecognition {
|
|
|
162
549
|
this.lastFinalTranscriptTime = Date.now();
|
|
163
550
|
this.audioTranscript += ` ${transcript}`;
|
|
164
551
|
this.audioTranscript = this.audioTranscript.trimStart();
|
|
552
|
+
this.finalTranscriptConfidence.push(confidence);
|
|
553
|
+
const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
|
|
165
554
|
this.audioInterimTranscript = '';
|
|
555
|
+
this.audioPreflightTranscript = '';
|
|
556
|
+
|
|
557
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
558
|
+
// vad disabled, use stt timestamp
|
|
559
|
+
// TODO: this would screw up transcription latency metrics
|
|
560
|
+
// but we'll live with it for now.
|
|
561
|
+
// the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
|
|
562
|
+
// and using that timestamp for lastSpeakingTime
|
|
563
|
+
this.lastSpeakingTime = Date.now();
|
|
564
|
+
}
|
|
166
565
|
|
|
167
|
-
if (
|
|
168
|
-
if (
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
566
|
+
if (this.vadBaseTurnDetection || this.userTurnCommitted) {
|
|
567
|
+
if (transcriptChanged) {
|
|
568
|
+
this.logger.debug(
|
|
569
|
+
{ transcript: this.audioTranscript },
|
|
570
|
+
'triggering preemptive generation (FINAL_TRANSCRIPT)',
|
|
571
|
+
);
|
|
572
|
+
this.hooks.onPreemptiveGeneration({
|
|
573
|
+
newTranscript: this.audioTranscript,
|
|
574
|
+
transcriptConfidence:
|
|
575
|
+
this.finalTranscriptConfidence.length > 0
|
|
576
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
577
|
+
this.finalTranscriptConfidence.length
|
|
578
|
+
: 0,
|
|
579
|
+
});
|
|
176
580
|
}
|
|
177
581
|
|
|
178
|
-
if (this.
|
|
582
|
+
if (!this.speaking) {
|
|
179
583
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
180
584
|
this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
|
|
181
585
|
this.runEOUDetection(chatCtx);
|
|
182
586
|
}
|
|
183
587
|
}
|
|
184
588
|
break;
|
|
589
|
+
case SpeechEventType.PREFLIGHT_TRANSCRIPT:
|
|
590
|
+
this.hooks.onInterimTranscript(ev);
|
|
591
|
+
const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
592
|
+
const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
593
|
+
const preflightLanguage = ev.alternatives?.[0]?.language;
|
|
594
|
+
|
|
595
|
+
const MIN_LANGUAGE_DETECTION_LENGTH = 5;
|
|
596
|
+
if (
|
|
597
|
+
!this.lastLanguage ||
|
|
598
|
+
(preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
|
|
599
|
+
) {
|
|
600
|
+
this.lastLanguage = preflightLanguage;
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
if (!preflightTranscript) {
|
|
604
|
+
return;
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
this.logger.debug(
|
|
608
|
+
{
|
|
609
|
+
user_transcript: preflightTranscript,
|
|
610
|
+
language: this.lastLanguage,
|
|
611
|
+
},
|
|
612
|
+
'received user preflight transcript',
|
|
613
|
+
);
|
|
614
|
+
|
|
615
|
+
// still need to increment it as it's used for turn detection,
|
|
616
|
+
this.lastFinalTranscriptTime = Date.now();
|
|
617
|
+
// preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
|
|
618
|
+
this.audioPreflightTranscript =
|
|
619
|
+
`${this.audioTranscript} ${preflightTranscript}`.trimStart();
|
|
620
|
+
this.audioInterimTranscript = preflightTranscript;
|
|
621
|
+
|
|
622
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
623
|
+
// vad disabled, use stt timestamp
|
|
624
|
+
this.lastSpeakingTime = Date.now();
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
|
|
628
|
+
const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
|
|
629
|
+
this.logger.debug(
|
|
630
|
+
{
|
|
631
|
+
transcript:
|
|
632
|
+
this.audioPreflightTranscript.length > 100
|
|
633
|
+
? this.audioPreflightTranscript.slice(0, 100) + '...'
|
|
634
|
+
: this.audioPreflightTranscript,
|
|
635
|
+
},
|
|
636
|
+
'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
|
|
637
|
+
);
|
|
638
|
+
this.hooks.onPreemptiveGeneration({
|
|
639
|
+
newTranscript: this.audioPreflightTranscript,
|
|
640
|
+
transcriptConfidence:
|
|
641
|
+
confidenceVals.length > 0
|
|
642
|
+
? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
|
|
643
|
+
: 0,
|
|
644
|
+
});
|
|
645
|
+
}
|
|
646
|
+
break;
|
|
185
647
|
case SpeechEventType.INTERIM_TRANSCRIPT:
|
|
186
648
|
this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
|
|
187
649
|
this.hooks.onInterimTranscript(ev);
|
|
188
650
|
this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
189
651
|
break;
|
|
652
|
+
case SpeechEventType.START_OF_SPEECH:
|
|
653
|
+
if (this.turnDetectionMode !== 'stt') break;
|
|
654
|
+
{
|
|
655
|
+
const span = this.ensureUserTurnSpan(Date.now());
|
|
656
|
+
const ctx = this.userTurnContext(span);
|
|
657
|
+
otelContext.with(ctx, () => {
|
|
658
|
+
this.hooks.onStartOfSpeech({
|
|
659
|
+
type: VADEventType.START_OF_SPEECH,
|
|
660
|
+
samplesIndex: 0,
|
|
661
|
+
timestamp: Date.now(),
|
|
662
|
+
speechDuration: 0,
|
|
663
|
+
silenceDuration: 0,
|
|
664
|
+
frames: [],
|
|
665
|
+
probability: 0,
|
|
666
|
+
inferenceDuration: 0,
|
|
667
|
+
speaking: true,
|
|
668
|
+
rawAccumulatedSilence: 0,
|
|
669
|
+
rawAccumulatedSpeech: 0,
|
|
670
|
+
});
|
|
671
|
+
});
|
|
672
|
+
}
|
|
673
|
+
this.speaking = true;
|
|
674
|
+
this.lastSpeakingTime = Date.now();
|
|
675
|
+
|
|
676
|
+
this.bounceEOUTask?.cancel();
|
|
677
|
+
break;
|
|
190
678
|
case SpeechEventType.END_OF_SPEECH:
|
|
191
679
|
if (this.turnDetectionMode !== 'stt') break;
|
|
680
|
+
{
|
|
681
|
+
const span = this.ensureUserTurnSpan();
|
|
682
|
+
const ctx = this.userTurnContext(span);
|
|
683
|
+
otelContext.with(ctx, () => {
|
|
684
|
+
this.hooks.onEndOfSpeech({
|
|
685
|
+
type: VADEventType.END_OF_SPEECH,
|
|
686
|
+
samplesIndex: 0,
|
|
687
|
+
timestamp: Date.now(),
|
|
688
|
+
speechDuration: 0,
|
|
689
|
+
silenceDuration: 0,
|
|
690
|
+
frames: [],
|
|
691
|
+
probability: 0,
|
|
692
|
+
inferenceDuration: 0,
|
|
693
|
+
speaking: false,
|
|
694
|
+
rawAccumulatedSilence: 0,
|
|
695
|
+
rawAccumulatedSpeech: 0,
|
|
696
|
+
});
|
|
697
|
+
});
|
|
698
|
+
}
|
|
699
|
+
this.speaking = false;
|
|
192
700
|
this.userTurnCommitted = true;
|
|
701
|
+
this.lastSpeakingTime = Date.now();
|
|
193
702
|
|
|
194
703
|
if (!this.speaking) {
|
|
195
704
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
@@ -199,6 +708,12 @@ export class AudioRecognition {
|
|
|
199
708
|
}
|
|
200
709
|
}
|
|
201
710
|
|
|
711
|
+
/**
 * Forwards an overlapping-speech event to the `onInterruption` hook, but only when the
 * event is flagged as an interruption.
 *
 * @param ev - The overlapping-speech event to inspect.
 */
private onOverlapSpeechEvent(ev: OverlappingSpeechEvent) {
  if (!ev.isInterruption) return;
  this.hooks.onInterruption(ev);
}
|
|
716
|
+
|
|
202
717
|
private runEOUDetection(chatCtx: ChatContext) {
|
|
203
718
|
this.logger.debug(
|
|
204
719
|
{
|
|
@@ -222,61 +737,132 @@ export class AudioRecognition {
|
|
|
222
737
|
// disable EOU model if manual turn detection enabled
|
|
223
738
|
this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
|
|
224
739
|
|
|
225
|
-
const bounceEOUTask =
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
740
|
+
const bounceEOUTask =
|
|
741
|
+
(
|
|
742
|
+
lastSpeakingTime: number | undefined,
|
|
743
|
+
lastFinalTranscriptTime: number,
|
|
744
|
+
speechStartTime: number | undefined,
|
|
745
|
+
) =>
|
|
746
|
+
async (controller: AbortController) => {
|
|
747
|
+
let endpointingDelay = this.minEndpointingDelay;
|
|
748
|
+
|
|
749
|
+
const userTurnSpan = this.ensureUserTurnSpan();
|
|
750
|
+
const userTurnCtx = this.userTurnContext(userTurnSpan);
|
|
751
|
+
|
|
752
|
+
if (turnDetector) {
|
|
753
|
+
await tracer.startActiveSpan(
|
|
754
|
+
async (span) => {
|
|
755
|
+
this.logger.debug('Running turn detector model');
|
|
756
|
+
|
|
757
|
+
let endOfTurnProbability = 0.0;
|
|
758
|
+
let unlikelyThreshold: number | undefined;
|
|
759
|
+
|
|
760
|
+
if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
|
|
761
|
+
this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
|
|
762
|
+
} else {
|
|
763
|
+
try {
|
|
764
|
+
endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
765
|
+
unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
766
|
+
|
|
767
|
+
this.logger.debug(
|
|
768
|
+
{ endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage },
|
|
769
|
+
'end of turn probability',
|
|
770
|
+
);
|
|
771
|
+
|
|
772
|
+
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
773
|
+
endpointingDelay = this.maxEndpointingDelay;
|
|
774
|
+
}
|
|
775
|
+
} catch (error) {
|
|
776
|
+
this.logger.error(error, 'Error predicting end of turn');
|
|
777
|
+
}
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
span.setAttribute(
|
|
781
|
+
traceTypes.ATTR_CHAT_CTX,
|
|
782
|
+
JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
|
|
783
|
+
);
|
|
784
|
+
span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
|
|
785
|
+
span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0);
|
|
786
|
+
span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
|
|
787
|
+
span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
|
|
788
|
+
},
|
|
242
789
|
{
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
language: this.lastLanguage,
|
|
246
|
-
transcript: this.audioTranscript,
|
|
790
|
+
name: 'eou_detection',
|
|
791
|
+
context: userTurnCtx,
|
|
247
792
|
},
|
|
248
|
-
'EOU Detection',
|
|
249
793
|
);
|
|
250
|
-
|
|
251
|
-
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
252
|
-
endpointingDelay = this.maxEndpointingDelay;
|
|
253
|
-
}
|
|
254
794
|
}
|
|
255
|
-
}
|
|
256
795
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
796
|
+
let extraSleep = endpointingDelay;
|
|
797
|
+
if (lastSpeakingTime !== undefined) {
|
|
798
|
+
extraSleep += lastSpeakingTime - Date.now();
|
|
799
|
+
}
|
|
260
800
|
|
|
261
|
-
|
|
801
|
+
if (extraSleep > 0) {
|
|
802
|
+
// add delay to see if there's a potential upcoming EOU task that cancels this one
|
|
803
|
+
await delay(Math.max(extraSleep, 0), { signal: controller.signal });
|
|
804
|
+
}
|
|
262
805
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
806
|
+
this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
|
|
807
|
+
|
|
808
|
+
const confidenceAvg =
|
|
809
|
+
this.finalTranscriptConfidence.length > 0
|
|
810
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
811
|
+
this.finalTranscriptConfidence.length
|
|
812
|
+
: 0;
|
|
813
|
+
|
|
814
|
+
let startedSpeakingAt: number | undefined;
|
|
815
|
+
let stoppedSpeakingAt: number | undefined;
|
|
816
|
+
let transcriptionDelay: number | undefined;
|
|
817
|
+
let endOfUtteranceDelay: number | undefined;
|
|
818
|
+
|
|
819
|
+
// sometimes, we can't calculate the metrics because VAD was unreliable.
|
|
820
|
+
// in this case, we just ignore the calculation, it's better than providing likely wrong values
|
|
821
|
+
if (
|
|
822
|
+
lastFinalTranscriptTime !== 0 &&
|
|
823
|
+
lastSpeakingTime !== undefined &&
|
|
824
|
+
speechStartTime !== undefined
|
|
825
|
+
) {
|
|
826
|
+
startedSpeakingAt = speechStartTime;
|
|
827
|
+
stoppedSpeakingAt = lastSpeakingTime;
|
|
828
|
+
transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
|
|
829
|
+
endOfUtteranceDelay = Date.now() - lastSpeakingTime;
|
|
830
|
+
}
|
|
268
831
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
832
|
+
const committed = await this.hooks.onEndOfTurn({
|
|
833
|
+
newTranscript: this.audioTranscript,
|
|
834
|
+
transcriptConfidence: confidenceAvg,
|
|
835
|
+
transcriptionDelay: transcriptionDelay ?? 0,
|
|
836
|
+
endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
|
|
837
|
+
startedSpeakingAt,
|
|
838
|
+
stoppedSpeakingAt,
|
|
839
|
+
});
|
|
840
|
+
|
|
841
|
+
if (committed) {
|
|
842
|
+
this._endUserTurnSpan({
|
|
843
|
+
transcript: this.audioTranscript,
|
|
844
|
+
confidence: confidenceAvg,
|
|
845
|
+
transcriptionDelay: transcriptionDelay ?? 0,
|
|
846
|
+
endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
|
|
847
|
+
});
|
|
848
|
+
|
|
849
|
+
// clear the transcript if the user turn was committed
|
|
850
|
+
this.audioTranscript = '';
|
|
851
|
+
this.finalTranscriptConfidence = [];
|
|
852
|
+
this.lastSpeakingTime = undefined;
|
|
853
|
+
this.lastFinalTranscriptTime = 0;
|
|
854
|
+
this.speechStartTime = undefined;
|
|
855
|
+
}
|
|
273
856
|
|
|
274
|
-
|
|
275
|
-
|
|
857
|
+
this.userTurnCommitted = false;
|
|
858
|
+
};
|
|
276
859
|
|
|
277
860
|
// cancel any existing EOU task
|
|
278
861
|
this.bounceEOUTask?.cancel();
|
|
279
|
-
|
|
862
|
+
// copy the values before awaiting (the values can change)
|
|
863
|
+
this.bounceEOUTask = Task.from(
|
|
864
|
+
bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
|
|
865
|
+
);
|
|
280
866
|
|
|
281
867
|
this.bounceEOUTask.result
|
|
282
868
|
.then(() => {
|
|
@@ -364,7 +950,12 @@ export class AudioRecognition {
|
|
|
364
950
|
switch (ev.type) {
|
|
365
951
|
case VADEventType.START_OF_SPEECH:
|
|
366
952
|
this.logger.debug('VAD task: START_OF_SPEECH');
|
|
367
|
-
|
|
953
|
+
{
|
|
954
|
+
const startTime = Date.now() - ev.speechDuration;
|
|
955
|
+
const span = this.ensureUserTurnSpan(startTime);
|
|
956
|
+
const ctx = this.userTurnContext(span);
|
|
957
|
+
otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
|
|
958
|
+
}
|
|
368
959
|
this.speaking = true;
|
|
369
960
|
|
|
370
961
|
// Capture sample rate from the first VAD event if not already set
|
|
@@ -376,13 +967,27 @@ export class AudioRecognition {
|
|
|
376
967
|
break;
|
|
377
968
|
case VADEventType.INFERENCE_DONE:
|
|
378
969
|
this.hooks.onVADInferenceDone(ev);
|
|
970
|
+
// for metrics, get the "earliest" signal of speech as possible
|
|
971
|
+
if (ev.rawAccumulatedSpeech > 0.0) {
|
|
972
|
+
this.lastSpeakingTime = Date.now();
|
|
973
|
+
|
|
974
|
+
if (this.speechStartTime === undefined) {
|
|
975
|
+
// Backdate speechStartTime to the actual start of accumulated speech.
|
|
976
|
+
// ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS).
|
|
977
|
+
this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech;
|
|
978
|
+
}
|
|
979
|
+
}
|
|
379
980
|
break;
|
|
380
981
|
case VADEventType.END_OF_SPEECH:
|
|
381
982
|
this.logger.debug('VAD task: END_OF_SPEECH');
|
|
382
|
-
|
|
383
|
-
|
|
983
|
+
{
|
|
984
|
+
const span = this.ensureUserTurnSpan();
|
|
985
|
+
const ctx = this.userTurnContext(span);
|
|
986
|
+
otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
|
|
987
|
+
}
|
|
988
|
+
|
|
384
989
|
// when VAD fires END_OF_SPEECH, it already waited for the silence_duration
|
|
385
|
-
this.
|
|
990
|
+
this.speaking = false;
|
|
386
991
|
|
|
387
992
|
if (
|
|
388
993
|
this.vadBaseTurnDetection ||
|
|
@@ -401,6 +1006,136 @@ export class AudioRecognition {
|
|
|
401
1006
|
}
|
|
402
1007
|
}
|
|
403
1008
|
|
|
1009
|
+
/**
 * Runs the interruption-detection loop until `signal` aborts or an unrecoverable error occurs.
 *
 * Each iteration creates a fresh detector stream, forwards everything read from
 * `interruptionStreamChannel` into it, and hands every event the stream emits to
 * `onOverlapSpeechEvent`. Retryable API errors are retried after `intervalForRetry(numRetries)`
 * up to `apiConnectDefaults.maxRetries`; every failure is reported through
 * `interruptionDetection.emitError`.
 *
 * Does nothing when no detector is given or no interruption stream channel exists.
 *
 * @param interruptionDetection - The detector to stream through, or `undefined` to disable.
 * @param signal - Abort signal that stops the loop and triggers stream cleanup.
 */
private async createInterruptionTask(
  interruptionDetection: AdaptiveInterruptionDetector | undefined,
  signal: AbortSignal,
) {
  if (!interruptionDetection || !this.interruptionStreamChannel) return;

  let numRetries = 0;
  const maxRetries = apiConnectDefaults.maxRetries;

  while (!signal.aborted) {
    const stream = interruptionDetection.createStream();
    const eventReader = stream.stream().getReader();

    // Shared by the abort listener and the `finally` below; errors are only logged so that
    // cleanup never masks the original failure.
    const cleanup = async () => {
      try {
        signal.removeEventListener('abort', cleanup);
        eventReader.releaseLock();
        await stream.close();
      } catch (e) {
        // Pass the error as the structured first argument, like the other logger calls in
        // this method, so it is not lost as an unused trailing argument.
        this.logger.debug({ err: e }, 'createInterruptionTask: error during cleanup:');
      }
    };

    signal.addEventListener('abort', cleanup, { once: true });

    let forwardTask: Promise<void> | undefined;

    try {
      // Unlike Python where _agent_speech_started lives on `self` and survives retries,
      // JS creates a fresh InterruptionStreamBase per retry with agentSpeechStarted = false.
      // Re-inject the sentinel so the new stream knows the agent is mid-speech.
      if (numRetries > 0 && this.isAgentSpeaking) {
        await stream.pushFrame(InterruptionStreamSentinel.agentSpeechStarted());
      }

      // Pump input items (audio frames and sentinels) from the channel into the detector stream.
      forwardTask = (async () => {
        const inputReader = this.interruptionStreamChannel!.stream().getReader();
        const abortPromise = waitForAbort(signal);

        try {
          while (!signal.aborted) {
            const res = await Promise.race([inputReader.read(), abortPromise]);
            if (!res) break;

            const { value, done } = res;
            if (done) break;

            // Record when input started, once; the first audio frame is back-dated by its
            // own duration.
            if (value instanceof AudioFrame) {
              const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000;
              this._inputStartedAt ??= Date.now() - frameDurationMs;
            } else {
              this._inputStartedAt ??= Date.now();
            }

            await stream.pushFrame(value);
          }
        } finally {
          inputReader.releaseLock();
        }
      })();

      const abortPromise = waitForAbort(signal);

      // Consume detector events until the stream ends or the signal aborts.
      while (!signal.aborted) {
        const res = await Promise.race([eventReader.read(), abortPromise]);
        if (!res) break;
        const { done, value: ev } = res;
        if (done) break;
        this.onOverlapSpeechEvent(ev);
      }
      break;
    } catch (e) {
      if (signal.aborted) break;

      if (isAPIError(e)) {
        if (maxRetries === 0 || !e.retryable) {
          // retries disabled or error not retryable: report as unrecoverable and stop
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              e.message,
              Date.now(),
              interruptionDetection.label,
              false,
            ),
          );
          break;
        } else if (numRetries >= maxRetries) {
          // retry budget exhausted
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              `failed to detect interruption after ${numRetries} attempts`,
              Date.now(),
              interruptionDetection.label,
              false,
            ),
          );
          break;
        } else {
          // recoverable: report, wait, then loop to build a fresh stream
          const retryInterval = intervalForRetry(numRetries);
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              e.message,
              Date.now(),
              interruptionDetection.label,
              true,
            ),
          );
          this.logger.warn(
            { model: interruptionDetection.label, attempt: numRetries },
            `failed to detect interruption, retrying in ${retryInterval}ms`,
          );
          numRetries++;
          await delay(retryInterval, { signal });
        }
      } else {
        const msg = e instanceof Error ? e.message : String(e);
        interruptionDetection.emitError(
          new InterruptionDetectionError(msg, Date.now(), interruptionDetection.label, false),
        );
        this.logger.error(e, 'Error in interruption task');
        break;
      }
    } finally {
      await cleanup();
      await forwardTask?.catch((e) => {
        this.logger.debug({ err: e }, 'interruption task exited with error');
      });
    }
  }
  this.logger.debug('Interruption task closed');
}
|
|
1138
|
+
|
|
404
1139
|
/**
 * Sets `audioStream` as the source of the deferred input stream.
 *
 * @param audioStream - Readable stream of audio frames to use as input.
 */
setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
  this.deferredInputStream.setSource(audioStream);
}
|
|
@@ -412,6 +1147,8 @@ export class AudioRecognition {
|
|
|
412
1147
|
clearUserTurn() {
|
|
413
1148
|
this.audioTranscript = '';
|
|
414
1149
|
this.audioInterimTranscript = '';
|
|
1150
|
+
this.audioPreflightTranscript = '';
|
|
1151
|
+
this.finalTranscriptConfidence = [];
|
|
415
1152
|
this.userTurnCommitted = false;
|
|
416
1153
|
|
|
417
1154
|
this.sttTask?.cancelAndWait().finally(() => {
|
|
@@ -460,19 +1197,57 @@ export class AudioRecognition {
|
|
|
460
1197
|
this.logger.debug('User turn committed');
|
|
461
1198
|
})
|
|
462
1199
|
.catch((err: unknown) => {
|
|
1200
|
+
if (err instanceof Error && err.name === 'AbortError') {
|
|
1201
|
+
this.logger.debug('User turn commit task cancelled');
|
|
1202
|
+
return;
|
|
1203
|
+
}
|
|
463
1204
|
this.logger.error(err, 'Error in user turn commit task:');
|
|
464
1205
|
});
|
|
465
1206
|
}
|
|
466
1207
|
|
|
467
1208
|
/**
 * Shuts the recognizer down: detaches the input audio stream, releases the silence audio
 * writer's lock, cancels each background task in turn (waiting for it to finish), and
 * finally closes the interruption stream channel when one exists.
 */
async close() {
  this.detachInputAudioStream();
  this.silenceAudioWriter.releaseLock();
  // Tasks are cancelled one after another; each field is read only after the previous
  // cancellation has completed.
  await this.commitUserTurnTask?.cancelAndWait();
  await this.sttTask?.cancelAndWait();
  await this.vadTask?.cancelAndWait();
  await this.bounceEOUTask?.cancelAndWait();
  await this.interruptionTask?.cancelAndWait();
  // The channel is closed only after the interruption task has been cancelled.
  await this.interruptionStreamChannel?.close();
}
|
|
1218
|
+
|
|
1219
|
+
/**
 * Finishes the active user-turn span, if any: records the turn's transcript, confidence and
 * delay attributes on it, ends it, and clears `this.userTurnSpan`.
 *
 * Does nothing when no user-turn span is set.
 */
private _endUserTurnSpan({
  transcript,
  confidence,
  transcriptionDelay,
  endOfUtteranceDelay,
}: {
  transcript: string;
  confidence: number;
  transcriptionDelay: number;
  endOfUtteranceDelay: number;
}): void {
  const span = this.userTurnSpan;
  if (!span) return;

  span.setAttributes({
    [traceTypes.ATTR_USER_TRANSCRIPT]: transcript,
    [traceTypes.ATTR_TRANSCRIPT_CONFIDENCE]: confidence,
    [traceTypes.ATTR_TRANSCRIPTION_DELAY]: transcriptionDelay,
    [traceTypes.ATTR_END_OF_TURN_DELAY]: endOfUtteranceDelay,
  });
  span.end();
  this.userTurnSpan = undefined;
}
|
|
474
1241
|
|
|
475
1242
|
/**
 * Whether turn detection is VAD-based: `true` only when `turnDetectionMode` is unset or
 * equal to `'vad'`. Any other value, including an object-valued mode, yields `false`.
 */
private get vadBaseTurnDetection() {
  const mode = this.turnDetectionMode;
  return mode === undefined || mode === 'vad';
}
|
|
478
1253
|
}
|