@livekit/agents 1.0.47 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +12 -12
- package/dist/inference/api_protos.d.ts +12 -12
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +28 -1
- package/dist/llm/chat_context.d.ts +28 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +2 -2
- package/dist/llm/index.d.ts +2 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +1 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +1 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +34 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +11 -2
- package/dist/voice/agent.d.ts +11 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +34 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +292 -44
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +27 -6
- package/dist/voice/agent_activity.d.ts +27 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +293 -45
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +16 -41
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +17 -43
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/constants.ts +13 -0
- package/src/index.ts +2 -1
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +161 -0
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/utils.ts +5 -0
- package/src/voice/agent.ts +41 -3
- package/src/voice/agent_activity.ts +371 -34
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +21 -64
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -7,8 +7,11 @@ import type { Span } from '@opentelemetry/api';
|
|
|
7
7
|
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
8
8
|
import { Heap } from 'heap-js';
|
|
9
9
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
10
|
-
import { ReadableStream } from 'node:stream/web';
|
|
11
|
-
import
|
|
10
|
+
import { ReadableStream, TransformStream } from 'node:stream/web';
|
|
11
|
+
import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
|
|
12
|
+
import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
13
|
+
import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
|
|
14
|
+
import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
|
|
12
15
|
import {
|
|
13
16
|
type ChatItem,
|
|
14
17
|
type FunctionCall,
|
|
@@ -23,12 +26,14 @@ import {
|
|
|
23
26
|
type RealtimeSession,
|
|
24
27
|
type ToolChoice,
|
|
25
28
|
type ToolContext,
|
|
29
|
+
ToolFlag,
|
|
26
30
|
} from '../llm/index.js';
|
|
27
31
|
import type { LLMError } from '../llm/llm.js';
|
|
28
32
|
import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
29
33
|
import { log } from '../log.js';
|
|
30
34
|
import type {
|
|
31
35
|
EOUMetrics,
|
|
36
|
+
InterruptionMetrics,
|
|
32
37
|
LLMMetrics,
|
|
33
38
|
RealtimeModelMetrics,
|
|
34
39
|
STTMetrics,
|
|
@@ -56,7 +61,6 @@ import {
|
|
|
56
61
|
type EndOfTurnInfo,
|
|
57
62
|
type PreemptiveGenerationInfo,
|
|
58
63
|
type RecognitionHooks,
|
|
59
|
-
type _TurnDetector,
|
|
60
64
|
} from './audio_recognition.js';
|
|
61
65
|
import {
|
|
62
66
|
AgentSessionEventTypes,
|
|
@@ -83,6 +87,12 @@ import { SpeechHandle } from './speech_handle.js';
|
|
|
83
87
|
import { setParticipantSpanAttributes } from './utils.js';
|
|
84
88
|
|
|
85
89
|
export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
|
|
90
|
+
export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
|
|
91
|
+
|
|
92
|
+
interface OnEnterData {
|
|
93
|
+
session: AgentSession;
|
|
94
|
+
agent: Agent;
|
|
95
|
+
}
|
|
86
96
|
|
|
87
97
|
interface PreemptiveGeneration {
|
|
88
98
|
speechHandle: SpeechHandle;
|
|
@@ -94,6 +104,7 @@ interface PreemptiveGeneration {
|
|
|
94
104
|
createdAt: number;
|
|
95
105
|
}
|
|
96
106
|
|
|
107
|
+
// TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
|
|
97
108
|
export class AgentActivity implements RecognitionHooks {
|
|
98
109
|
agent: Agent;
|
|
99
110
|
agentSession: AgentSession;
|
|
@@ -104,7 +115,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
104
115
|
private audioRecognition?: AudioRecognition;
|
|
105
116
|
private realtimeSession?: RealtimeSession;
|
|
106
117
|
private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
|
|
107
|
-
private turnDetectionMode?:
|
|
118
|
+
private turnDetectionMode?: TurnDetectionMode;
|
|
108
119
|
private logger = log();
|
|
109
120
|
private _schedulingPaused = true;
|
|
110
121
|
private _drainBlockedTasks: Task<any>[] = [];
|
|
@@ -119,6 +130,43 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
119
130
|
// default to null as None, which maps to the default provider tool choice value
|
|
120
131
|
private toolChoice: ToolChoice | null = null;
|
|
121
132
|
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
133
|
+
private interruptionDetector?: AdaptiveInterruptionDetector;
|
|
134
|
+
private isInterruptionDetectionEnabled: boolean;
|
|
135
|
+
private isInterruptionByAudioActivityEnabled: boolean;
|
|
136
|
+
private isDefaultInterruptionByAudioActivityEnabled: boolean;
|
|
137
|
+
|
|
138
|
+
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
|
|
139
|
+
this.onGenerationCreated(ev);
|
|
140
|
+
|
|
141
|
+
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
|
|
142
|
+
this.onInputSpeechStarted(ev);
|
|
143
|
+
|
|
144
|
+
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
|
|
145
|
+
this.onInputSpeechStopped(ev);
|
|
146
|
+
|
|
147
|
+
private readonly onRealtimeInputAudioTranscriptionCompleted = (
|
|
148
|
+
ev: InputTranscriptionCompleted,
|
|
149
|
+
): void => this.onInputAudioTranscriptionCompleted(ev);
|
|
150
|
+
|
|
151
|
+
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
|
|
152
|
+
this.onError(ev);
|
|
153
|
+
|
|
154
|
+
private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
|
|
155
|
+
this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
|
|
159
|
+
this.agentSession.emit(
|
|
160
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
161
|
+
createMetricsCollectedEvent({ metrics: ev }),
|
|
162
|
+
);
|
|
163
|
+
};
|
|
164
|
+
|
|
165
|
+
private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
|
|
166
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
167
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
168
|
+
this.agentSession._onError(ev);
|
|
169
|
+
};
|
|
122
170
|
|
|
123
171
|
/** @internal */
|
|
124
172
|
_mainTask?: Task<void>;
|
|
@@ -126,16 +174,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
126
174
|
_onExitTask?: Task<void>;
|
|
127
175
|
_userTurnCompletedTask?: Task<void>;
|
|
128
176
|
|
|
129
|
-
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
|
|
130
|
-
this.onGenerationCreated(ev);
|
|
131
|
-
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
|
|
132
|
-
this.onInputSpeechStarted(ev);
|
|
133
|
-
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
|
|
134
|
-
this.onInputSpeechStopped(ev);
|
|
135
|
-
private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
|
|
136
|
-
this.onInputAudioTranscriptionCompleted(ev);
|
|
137
|
-
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
|
|
138
|
-
this.onError(ev);
|
|
139
177
|
constructor(agent: Agent, agentSession: AgentSession) {
|
|
140
178
|
this.agent = agent;
|
|
141
179
|
this.agentSession = agentSession;
|
|
@@ -228,6 +266,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
228
266
|
'for more responsive interruption handling.',
|
|
229
267
|
);
|
|
230
268
|
}
|
|
269
|
+
|
|
270
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
271
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
272
|
+
|
|
273
|
+
// this allows taking over audio interruption temporarily until interruption is detected
|
|
274
|
+
// by default it is true unless turnDetection is manual or realtime_llm
|
|
275
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
276
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
277
|
+
|
|
278
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
231
279
|
}
|
|
232
280
|
|
|
233
281
|
async start(): Promise<void> {
|
|
@@ -312,6 +360,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
312
360
|
}
|
|
313
361
|
}
|
|
314
362
|
|
|
363
|
+
// TODO(parity): Record initial AgentConfigUpdate in chat context
|
|
364
|
+
|
|
315
365
|
// metrics and error handling
|
|
316
366
|
if (this.llm instanceof LLM) {
|
|
317
367
|
this.llm.on('metrics_collected', this.onMetricsCollected);
|
|
@@ -339,8 +389,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
339
389
|
vad: this.vad,
|
|
340
390
|
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
341
391
|
turnDetectionMode: this.turnDetectionMode,
|
|
342
|
-
|
|
343
|
-
|
|
392
|
+
interruptionDetection: this.interruptionDetector,
|
|
393
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
394
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
344
395
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
345
396
|
sttModel: this.stt?.label,
|
|
346
397
|
sttProvider: this.getSttProvider(),
|
|
@@ -354,11 +405,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
354
405
|
if (runOnEnter) {
|
|
355
406
|
this._onEnterTask = this.createSpeechTask({
|
|
356
407
|
taskFn: () =>
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
408
|
+
onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
|
|
409
|
+
tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
410
|
+
name: 'on_enter',
|
|
411
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
412
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
413
|
+
}),
|
|
414
|
+
),
|
|
362
415
|
inlineTask: true,
|
|
363
416
|
name: 'AgentActivity_onEnter',
|
|
364
417
|
});
|
|
@@ -412,7 +465,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
412
465
|
|
|
413
466
|
get allowInterruptions(): boolean {
|
|
414
467
|
// TODO(AJS-51): Allow options to be defined in Agent class
|
|
415
|
-
return this.agentSession.options.
|
|
468
|
+
return this.agentSession.options.turnHandling.interruption?.mode !== false;
|
|
416
469
|
}
|
|
417
470
|
|
|
418
471
|
get useTtsAlignedTranscript(): boolean {
|
|
@@ -429,6 +482,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
429
482
|
return this.agent.toolCtx;
|
|
430
483
|
}
|
|
431
484
|
|
|
485
|
+
/** @internal */
|
|
486
|
+
get inputStartedAt() {
|
|
487
|
+
return this.audioRecognition?.inputStartedAt;
|
|
488
|
+
}
|
|
489
|
+
|
|
432
490
|
async updateChatCtx(chatCtx: ChatContext): Promise<void> {
|
|
433
491
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
434
492
|
|
|
@@ -446,7 +504,27 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
446
504
|
}
|
|
447
505
|
}
|
|
448
506
|
|
|
449
|
-
|
|
507
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
508
|
+
/**
 * Replaces the agent's tool set.
 *
 * Stores a shallow copy of `tools` on the agent, forwards the tools to the realtime
 * session when one exists, and, when the model is a plain `LLM`, re-applies the chat
 * context copied against the new tools.
 *
 * @param tools - The tool context to install.
 */
async updateTools(tools: ToolContext): Promise<void> {
  this.agent._tools = { ...tools };

  if (this.realtimeSession) {
    await this.realtimeSession.updateTools(tools);
  }

  // For a realtime LLM, we assume the server will remove invalid tool messages,
  // so only the non-realtime LLM path rebuilds the chat context here.
  if (!(this.llm instanceof LLM)) {
    return;
  }

  const refreshedChatCtx = this.agent._chatCtx.copy({ toolCtx: tools });
  await this.updateChatCtx(refreshedChatCtx);
}
|
|
520
|
+
|
|
521
|
+
updateOptions({
|
|
522
|
+
toolChoice,
|
|
523
|
+
turnDetection,
|
|
524
|
+
}: {
|
|
525
|
+
toolChoice?: ToolChoice | null;
|
|
526
|
+
turnDetection?: TurnDetectionMode;
|
|
527
|
+
}): void {
|
|
450
528
|
if (toolChoice !== undefined) {
|
|
451
529
|
this.toolChoice = toolChoice;
|
|
452
530
|
}
|
|
@@ -454,14 +532,46 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
454
532
|
if (this.realtimeSession) {
|
|
455
533
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
456
534
|
}
|
|
535
|
+
|
|
536
|
+
if (turnDetection !== undefined) {
|
|
537
|
+
this.turnDetectionMode = turnDetection;
|
|
538
|
+
this.isDefaultInterruptionByAudioActivityEnabled =
|
|
539
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
540
|
+
|
|
541
|
+
// sync live flag immediately when not speaking so the change takes effect right away
|
|
542
|
+
if (this.agentSession.agentState !== 'speaking') {
|
|
543
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
544
|
+
this.isDefaultInterruptionByAudioActivityEnabled;
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
if (this.audioRecognition) {
|
|
549
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
550
|
+
}
|
|
457
551
|
}
|
|
458
552
|
|
|
459
553
|
attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
|
|
460
554
|
void this.audioStream.close();
|
|
461
555
|
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
462
556
|
|
|
557
|
+
// Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
|
|
558
|
+
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
|
|
559
|
+
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
|
|
560
|
+
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
|
|
561
|
+
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
|
|
562
|
+
transform: (frame, controller) => {
|
|
563
|
+
const shouldDiscardForAecWarmup =
|
|
564
|
+
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
|
|
565
|
+
if (!shouldDiscardForAecWarmup) {
|
|
566
|
+
controller.enqueue(frame);
|
|
567
|
+
}
|
|
568
|
+
},
|
|
569
|
+
});
|
|
570
|
+
|
|
463
571
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
464
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
572
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
573
|
+
.pipeThrough(aecWarmupAudioFilter)
|
|
574
|
+
.tee();
|
|
465
575
|
|
|
466
576
|
if (this.realtimeSession) {
|
|
467
577
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -614,6 +724,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
614
724
|
|
|
615
725
|
if (!this.vad) {
|
|
616
726
|
this.agentSession._updateUserState('speaking');
|
|
727
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
728
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
729
|
+
0,
|
|
730
|
+
Date.now(),
|
|
731
|
+
this.agentSession._userSpeakingSpan,
|
|
732
|
+
);
|
|
733
|
+
}
|
|
617
734
|
}
|
|
618
735
|
|
|
619
736
|
// this.interrupt() is going to raise when allow_interruptions is False,
|
|
@@ -632,6 +749,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
632
749
|
this.logger.info(ev, 'onInputSpeechStopped');
|
|
633
750
|
|
|
634
751
|
if (!this.vad) {
|
|
752
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
753
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
754
|
+
}
|
|
635
755
|
this.agentSession._updateUserState('listening');
|
|
636
756
|
}
|
|
637
757
|
|
|
@@ -705,15 +825,32 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
705
825
|
/**
 * Handles a VAD start-of-speech event.
 *
 * Back-dates the start timestamp by the event's `speechDuration` and `inferenceDuration`,
 * marks the user as speaking, and, when interruption detection is enabled and audio
 * recognition exists, reports the start of overlapping speech.
 *
 * @param ev - The VAD event that triggered the callback.
 */
onStartOfSpeech(ev: VADEvent): void {
  let speechStartTime = Date.now();
  if (ev) {
    // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
    speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
  }
  this.agentSession._updateUserState('speaking', speechStartTime);
  if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
    // Pass speechStartTime as the absolute startedAt timestamp.
    // `ev` is treated as possibly absent above, so it is read defensively here as well
    // (falling back to a zero duration) instead of throwing on a missing event.
    this.audioRecognition.onStartOfOverlapSpeech(
      ev?.speechDuration ?? 0,
      speechStartTime,
      this.agentSession._userSpeakingSpan,
    );
  }
}
|
|
712
841
|
|
|
713
842
|
/**
 * Handles a VAD end-of-speech event.
 *
 * Computes the end timestamp (back-dated by the event's `silenceDuration` and
 * `inferenceDuration` when an event is given), reports the end of overlapping speech
 * when interruption detection is enabled and audio recognition exists, then marks the
 * user as listening.
 *
 * @param ev - The VAD event that triggered the callback.
 */
onEndOfSpeech(ev: VADEvent): void {
  const now = Date.now();
  // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
  const speechEndTime = ev ? now - ev.silenceDuration - ev.inferenceDuration : now;

  if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
    // speechEndTime is passed as the absolute endedAt timestamp.
    const userSpeakingSpan = this.agentSession._userSpeakingSpan;
    this.audioRecognition.onEndOfOverlapSpeech(speechEndTime, userSpeakingSpan);
  }

  this.agentSession._updateUserState('listening', speechEndTime);
}
|
|
@@ -724,12 +861,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
724
861
|
return;
|
|
725
862
|
}
|
|
726
863
|
|
|
727
|
-
if (ev.speechDuration >= this.agentSession.options.
|
|
864
|
+
if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) {
|
|
728
865
|
this.interruptByAudioActivity();
|
|
729
866
|
}
|
|
730
867
|
}
|
|
731
868
|
|
|
732
869
|
private interruptByAudioActivity(): void {
|
|
870
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
871
|
+
return;
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
875
|
+
// Disable interruption from audio activity while AEC warmup is active.
|
|
876
|
+
return;
|
|
877
|
+
}
|
|
878
|
+
|
|
733
879
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
734
880
|
// skip speech handle interruption if server side turn detection is enabled
|
|
735
881
|
return;
|
|
@@ -739,7 +885,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
739
885
|
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
740
886
|
// - Apply check to all STT results: empty string, undefined, or any length
|
|
741
887
|
// - This ensures consistent behavior across all interruption scenarios
|
|
742
|
-
if (
|
|
888
|
+
if (
|
|
889
|
+
this.stt &&
|
|
890
|
+
this.agentSession.options.turnHandling.interruption?.minWords > 0 &&
|
|
891
|
+
this.audioRecognition
|
|
892
|
+
) {
|
|
743
893
|
const text = this.audioRecognition.currentTranscript;
|
|
744
894
|
// TODO(shubhra): better word splitting for multi-language
|
|
745
895
|
|
|
@@ -749,7 +899,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
749
899
|
|
|
750
900
|
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
751
901
|
// This applies to all cases: empty strings, partial speech, and full speech
|
|
752
|
-
if (wordCount < this.agentSession.options.
|
|
902
|
+
if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
|
|
753
903
|
return;
|
|
754
904
|
}
|
|
755
905
|
}
|
|
@@ -770,6 +920,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
770
920
|
}
|
|
771
921
|
}
|
|
772
922
|
|
|
923
|
+
/**
 * Handles an overlapping-speech interruption event.
 *
 * Restores the audio-activity interruption flag to its default, attempts an
 * interruption by audio activity, and then notifies audio recognition (if present)
 * that agent speech ended.
 *
 * @param ev - The overlapping speech event.
 */
onInterruption(ev: OverlappingSpeechEvent) {
  this.restoreInterruptionByAudioActivity();
  this.interruptByAudioActivity();
  // Use the overlap start when it is truthy, otherwise fall back to the event timestamp.
  this.audioRecognition?.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
}
|
|
930
|
+
|
|
773
931
|
onInterimTranscript(ev: SpeechEvent): void {
|
|
774
932
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
775
933
|
// skip stt transcription if userTranscription is enabled on the realtime model
|
|
@@ -845,6 +1003,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
845
1003
|
const userMessage = ChatMessage.create({
|
|
846
1004
|
role: 'user',
|
|
847
1005
|
content: info.newTranscript,
|
|
1006
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
848
1007
|
});
|
|
849
1008
|
const chatCtx = this.agent.chatCtx.copy();
|
|
850
1009
|
const speechHandle = this.generateReply({
|
|
@@ -940,16 +1099,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
940
1099
|
this._currentSpeech &&
|
|
941
1100
|
this._currentSpeech.allowInterruptions &&
|
|
942
1101
|
!this._currentSpeech.interrupted &&
|
|
943
|
-
this.agentSession.options.
|
|
1102
|
+
this.agentSession.options.turnHandling.interruption?.minWords > 0
|
|
944
1103
|
) {
|
|
945
1104
|
const wordCount = splitWords(info.newTranscript, true).length;
|
|
946
|
-
if (wordCount < this.agentSession.options.
|
|
1105
|
+
if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
|
|
947
1106
|
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
948
1107
|
this.cancelPreemptiveGeneration();
|
|
949
1108
|
this.logger.info(
|
|
950
1109
|
{
|
|
951
1110
|
wordCount,
|
|
952
|
-
minInterruptionWords: this.agentSession.options.
|
|
1111
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords,
|
|
953
1112
|
},
|
|
954
1113
|
'skipping user input, word count below minimum interruption threshold',
|
|
955
1114
|
);
|
|
@@ -1129,12 +1288,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1129
1288
|
instructions = `${this.agent.instructions}\n${instructions}`;
|
|
1130
1289
|
}
|
|
1131
1290
|
|
|
1291
|
+
// Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
|
|
1292
|
+
const onEnterData = onEnterStorage.getStore();
|
|
1293
|
+
const shouldFilterTools =
|
|
1294
|
+
onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
|
|
1295
|
+
|
|
1296
|
+
const tools = shouldFilterTools
|
|
1297
|
+
? Object.fromEntries(
|
|
1298
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
1299
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
|
|
1300
|
+
),
|
|
1301
|
+
)
|
|
1302
|
+
: this.agent.toolCtx;
|
|
1303
|
+
|
|
1132
1304
|
const task = this.createSpeechTask({
|
|
1133
1305
|
taskFn: (abortController: AbortController) =>
|
|
1134
1306
|
this.pipelineReplyTask(
|
|
1135
1307
|
handle,
|
|
1136
1308
|
chatCtx ?? this.agent.chatCtx,
|
|
1137
|
-
|
|
1309
|
+
tools,
|
|
1138
1310
|
{
|
|
1139
1311
|
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
1140
1312
|
},
|
|
@@ -1234,6 +1406,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1234
1406
|
let userMessage: ChatMessage | undefined = ChatMessage.create({
|
|
1235
1407
|
role: 'user',
|
|
1236
1408
|
content: info.newTranscript,
|
|
1409
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
1237
1410
|
});
|
|
1238
1411
|
|
|
1239
1412
|
// create a temporary mutable chat context to pass to onUserTurnCompleted
|
|
@@ -1260,6 +1433,24 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1260
1433
|
return;
|
|
1261
1434
|
}
|
|
1262
1435
|
|
|
1436
|
+
const userMetricsReport: MetricsReport = {};
|
|
1437
|
+
if (info.startedSpeakingAt !== undefined) {
|
|
1438
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
|
|
1439
|
+
}
|
|
1440
|
+
if (info.stoppedSpeakingAt !== undefined) {
|
|
1441
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
|
|
1442
|
+
}
|
|
1443
|
+
if (info.transcriptionDelay !== undefined) {
|
|
1444
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
|
|
1445
|
+
}
|
|
1446
|
+
if (info.endOfUtteranceDelay !== undefined) {
|
|
1447
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
|
|
1448
|
+
}
|
|
1449
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
|
|
1450
|
+
if (userMessage) {
|
|
1451
|
+
userMessage.metrics = userMetricsReport;
|
|
1452
|
+
}
|
|
1453
|
+
|
|
1263
1454
|
let speechHandle: SpeechHandle | undefined;
|
|
1264
1455
|
if (this._preemptiveGeneration !== undefined) {
|
|
1265
1456
|
const preemptive = this._preemptiveGeneration;
|
|
@@ -1272,6 +1463,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1272
1463
|
isSameToolChoice(preemptive.toolChoice, this.toolChoice)
|
|
1273
1464
|
) {
|
|
1274
1465
|
speechHandle = preemptive.speechHandle;
|
|
1466
|
+
// The preemptive userMessage was created without metrics.
|
|
1467
|
+
// Copy the metrics and transcriptConfidence from the new userMessage
|
|
1468
|
+
// to the preemptive message BEFORE scheduling (so the pipeline inserts
|
|
1469
|
+
// the message with metrics already set).
|
|
1470
|
+
if (preemptive.userMessage && userMessage) {
|
|
1471
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1472
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1473
|
+
}
|
|
1275
1474
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1276
1475
|
this.logger.debug(
|
|
1277
1476
|
{
|
|
@@ -1365,11 +1564,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1365
1564
|
tasks.push(textForwardTask);
|
|
1366
1565
|
}
|
|
1367
1566
|
|
|
1567
|
+
let replyStartedSpeakingAt: number | undefined;
|
|
1568
|
+
let replyTtsGenData: _TTSGenerationData | null = null;
|
|
1569
|
+
|
|
1368
1570
|
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1571
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1369
1572
|
this.agentSession._updateAgentState('speaking', {
|
|
1370
1573
|
startTime: startedSpeakingAt,
|
|
1371
1574
|
otelContext: speechHandle._agentTurnContext,
|
|
1372
1575
|
});
|
|
1576
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1577
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1578
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1579
|
+
}
|
|
1373
1580
|
};
|
|
1374
1581
|
|
|
1375
1582
|
if (!audioOutput) {
|
|
@@ -1387,8 +1594,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1387
1594
|
audioSource,
|
|
1388
1595
|
modelSettings,
|
|
1389
1596
|
replyAbortController,
|
|
1597
|
+
this.tts?.model,
|
|
1598
|
+
this.tts?.provider,
|
|
1390
1599
|
);
|
|
1391
1600
|
tasks.push(ttsTask);
|
|
1601
|
+
replyTtsGenData = ttsGenData;
|
|
1392
1602
|
|
|
1393
1603
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1394
1604
|
ttsGenData.audioStream,
|
|
@@ -1428,10 +1638,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1428
1638
|
}
|
|
1429
1639
|
|
|
1430
1640
|
if (addToChatCtx) {
|
|
1641
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1642
|
+
const replyAssistantMetrics: MetricsReport = {};
|
|
1643
|
+
if (replyTtsGenData?.ttfb !== undefined) {
|
|
1644
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1645
|
+
}
|
|
1646
|
+
if (replyStartedSpeakingAt !== undefined) {
|
|
1647
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
|
|
1648
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1649
|
+
}
|
|
1650
|
+
|
|
1431
1651
|
const message = ChatMessage.create({
|
|
1432
1652
|
role: 'assistant',
|
|
1433
1653
|
content: textOut?.text || '',
|
|
1434
1654
|
interrupted: speechHandle.interrupted,
|
|
1655
|
+
metrics: replyAssistantMetrics,
|
|
1435
1656
|
});
|
|
1436
1657
|
this.agent._chatCtx.insert(message);
|
|
1437
1658
|
this.agentSession._conversationItemAdded(message);
|
|
@@ -1439,6 +1660,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1439
1660
|
|
|
1440
1661
|
if (this.agentSession.agentState === 'speaking') {
|
|
1441
1662
|
this.agentSession._updateAgentState('listening');
|
|
1663
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1664
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1665
|
+
}
|
|
1666
|
+
this.restoreInterruptionByAudioActivity();
|
|
1442
1667
|
}
|
|
1443
1668
|
}
|
|
1444
1669
|
|
|
@@ -1452,6 +1677,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1452
1677
|
newMessage,
|
|
1453
1678
|
toolsMessages,
|
|
1454
1679
|
span,
|
|
1680
|
+
_previousUserMetrics,
|
|
1455
1681
|
}: {
|
|
1456
1682
|
speechHandle: SpeechHandle;
|
|
1457
1683
|
chatCtx: ChatContext;
|
|
@@ -1462,6 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1462
1688
|
newMessage?: ChatMessage;
|
|
1463
1689
|
toolsMessages?: ChatItem[];
|
|
1464
1690
|
span: Span;
|
|
1691
|
+
_previousUserMetrics?: MetricsReport;
|
|
1465
1692
|
}): Promise<void> => {
|
|
1466
1693
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1467
1694
|
|
|
@@ -1514,6 +1741,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1514
1741
|
toolCtx,
|
|
1515
1742
|
modelSettings,
|
|
1516
1743
|
replyAbortController,
|
|
1744
|
+
this.llm?.model,
|
|
1745
|
+
this.llm?.provider,
|
|
1517
1746
|
);
|
|
1518
1747
|
tasks.push(llmTask);
|
|
1519
1748
|
|
|
@@ -1530,6 +1759,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1530
1759
|
ttsTextInput,
|
|
1531
1760
|
modelSettings,
|
|
1532
1761
|
replyAbortController,
|
|
1762
|
+
this.tts?.model,
|
|
1763
|
+
this.tts?.provider,
|
|
1533
1764
|
);
|
|
1534
1765
|
tasks.push(ttsTask);
|
|
1535
1766
|
} else {
|
|
@@ -1539,10 +1770,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1539
1770
|
|
|
1540
1771
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1541
1772
|
|
|
1773
|
+
let userMetrics: MetricsReport | undefined = _previousUserMetrics;
|
|
1542
1774
|
// Add new message to actual chat context if the speech is scheduled
|
|
1543
1775
|
if (newMessage && speechHandle.scheduled) {
|
|
1544
1776
|
this.agent._chatCtx.insert(newMessage);
|
|
1545
1777
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1778
|
+
userMetrics = newMessage.metrics;
|
|
1546
1779
|
}
|
|
1547
1780
|
|
|
1548
1781
|
if (speechHandle.interrupted) {
|
|
@@ -1588,11 +1821,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1588
1821
|
textOut = _textOut;
|
|
1589
1822
|
}
|
|
1590
1823
|
|
|
1824
|
+
let agentStartedSpeakingAt: number | undefined;
|
|
1591
1825
|
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1826
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1592
1827
|
this.agentSession._updateAgentState('speaking', {
|
|
1593
1828
|
startTime: startedSpeakingAt,
|
|
1594
1829
|
otelContext: speechHandle._agentTurnContext,
|
|
1595
1830
|
});
|
|
1831
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1832
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1833
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1834
|
+
}
|
|
1596
1835
|
};
|
|
1597
1836
|
|
|
1598
1837
|
let audioOut: _AudioOut | null = null;
|
|
@@ -1649,6 +1888,29 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1649
1888
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1650
1889
|
}
|
|
1651
1890
|
|
|
1891
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1892
|
+
const assistantMetrics: MetricsReport = {};
|
|
1893
|
+
|
|
1894
|
+
if (llmGenData.ttft !== undefined) {
|
|
1895
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
|
|
1896
|
+
}
|
|
1897
|
+
if (ttsGenData?.ttfb !== undefined) {
|
|
1898
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
|
|
1899
|
+
}
|
|
1900
|
+
if (agentStartedSpeakingAt !== undefined) {
|
|
1901
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
|
|
1902
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1903
|
+
|
|
1904
|
+
if (userMetrics?.stoppedSpeakingAt !== undefined) {
|
|
1905
|
+
const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
|
|
1906
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1907
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1908
|
+
}
|
|
1909
|
+
}
|
|
1910
|
+
|
|
1911
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1912
|
+
let hasSpeechMessage = false;
|
|
1913
|
+
|
|
1652
1914
|
// add the tools messages that triggers this reply to the chat context
|
|
1653
1915
|
if (toolsMessages) {
|
|
1654
1916
|
for (const msg of toolsMessages) {
|
|
@@ -1703,45 +1965,54 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1703
1965
|
}
|
|
1704
1966
|
|
|
1705
1967
|
if (forwardedText) {
|
|
1968
|
+
hasSpeechMessage = true;
|
|
1706
1969
|
const message = ChatMessage.create({
|
|
1707
1970
|
role: 'assistant',
|
|
1708
1971
|
content: forwardedText,
|
|
1709
1972
|
id: llmGenData.id,
|
|
1710
1973
|
interrupted: true,
|
|
1711
1974
|
createdAt: replyStartedAt,
|
|
1975
|
+
metrics: assistantMetrics,
|
|
1712
1976
|
});
|
|
1713
1977
|
chatCtx.insert(message);
|
|
1714
1978
|
this.agent._chatCtx.insert(message);
|
|
1715
1979
|
speechHandle._itemAdded([message]);
|
|
1716
1980
|
this.agentSession._conversationItemAdded(message);
|
|
1981
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1717
1982
|
}
|
|
1718
1983
|
|
|
1719
1984
|
if (this.agentSession.agentState === 'speaking') {
|
|
1720
1985
|
this.agentSession._updateAgentState('listening');
|
|
1986
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1987
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1988
|
+
this.restoreInterruptionByAudioActivity();
|
|
1989
|
+
}
|
|
1721
1990
|
}
|
|
1722
1991
|
|
|
1723
1992
|
this.logger.info(
|
|
1724
1993
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
1725
1994
|
'playout completed with interrupt',
|
|
1726
1995
|
);
|
|
1727
|
-
// TODO(shubhra) add chat message to speech handle
|
|
1728
1996
|
speechHandle._markGenerationDone();
|
|
1729
1997
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1730
1998
|
return;
|
|
1731
1999
|
}
|
|
1732
2000
|
|
|
1733
2001
|
if (textOut && textOut.text) {
|
|
2002
|
+
hasSpeechMessage = true;
|
|
1734
2003
|
const message = ChatMessage.create({
|
|
1735
2004
|
role: 'assistant',
|
|
1736
2005
|
id: llmGenData.id,
|
|
1737
2006
|
interrupted: false,
|
|
1738
2007
|
createdAt: replyStartedAt,
|
|
1739
2008
|
content: textOut.text,
|
|
2009
|
+
metrics: assistantMetrics,
|
|
1740
2010
|
});
|
|
1741
2011
|
chatCtx.insert(message);
|
|
1742
2012
|
this.agent._chatCtx.insert(message);
|
|
1743
2013
|
speechHandle._itemAdded([message]);
|
|
1744
2014
|
this.agentSession._conversationItemAdded(message);
|
|
2015
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1745
2016
|
this.logger.info(
|
|
1746
2017
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1747
2018
|
'playout completed without interruption',
|
|
@@ -1752,6 +2023,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1752
2023
|
this.agentSession._updateAgentState('thinking');
|
|
1753
2024
|
} else if (this.agentSession.agentState === 'speaking') {
|
|
1754
2025
|
this.agentSession._updateAgentState('listening');
|
|
2026
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
2027
|
+
{
|
|
2028
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
2029
|
+
this.restoreInterruptionByAudioActivity();
|
|
2030
|
+
}
|
|
2031
|
+
}
|
|
1755
2032
|
}
|
|
1756
2033
|
|
|
1757
2034
|
// mark the playout done before waiting for the tool execution
|
|
@@ -1811,6 +2088,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1811
2088
|
instructions,
|
|
1812
2089
|
undefined,
|
|
1813
2090
|
toolMessages,
|
|
2091
|
+
hasSpeechMessage ? undefined : userMetrics,
|
|
1814
2092
|
),
|
|
1815
2093
|
ownedSpeechHandle: speechHandle,
|
|
1816
2094
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -1844,6 +2122,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1844
2122
|
instructions?: string,
|
|
1845
2123
|
newMessage?: ChatMessage,
|
|
1846
2124
|
toolsMessages?: ChatItem[],
|
|
2125
|
+
_previousUserMetrics?: MetricsReport,
|
|
1847
2126
|
): Promise<void> =>
|
|
1848
2127
|
tracer.startActiveSpan(
|
|
1849
2128
|
async (span) =>
|
|
@@ -1857,6 +2136,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1857
2136
|
newMessage,
|
|
1858
2137
|
toolsMessages,
|
|
1859
2138
|
span,
|
|
2139
|
+
_previousUserMetrics,
|
|
1860
2140
|
}),
|
|
1861
2141
|
{
|
|
1862
2142
|
name: 'agent_turn',
|
|
@@ -2007,6 +2287,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2007
2287
|
ttsTextInput,
|
|
2008
2288
|
modelSettings,
|
|
2009
2289
|
abortController,
|
|
2290
|
+
this.tts?.model,
|
|
2291
|
+
this.tts?.provider,
|
|
2010
2292
|
);
|
|
2011
2293
|
tasks.push(ttsTask);
|
|
2012
2294
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -2516,6 +2798,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2516
2798
|
if (this._mainTask) {
|
|
2517
2799
|
await this._mainTask.cancelAndWait();
|
|
2518
2800
|
}
|
|
2801
|
+
if (this.interruptionDetector) {
|
|
2802
|
+
this.interruptionDetector.off(
|
|
2803
|
+
'user_overlapping_speech',
|
|
2804
|
+
this.onInterruptionOverlappingSpeech,
|
|
2805
|
+
);
|
|
2806
|
+
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2807
|
+
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
2808
|
+
}
|
|
2519
2809
|
|
|
2520
2810
|
this.agent._agentActivity = undefined;
|
|
2521
2811
|
} finally {
|
|
@@ -2523,6 +2813,53 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2523
2813
|
}
|
|
2524
2814
|
}
|
|
2525
2815
|
|
|
2816
|
+
/**
 * Decides whether an adaptive interruption detector should be used and, if so,
 * creates it and attaches this activity's event handlers.
 *
 * The agent-level `interruptionDetection` setting takes precedence over the
 * session-level one.
 *
 * @returns A new detector with handlers attached, or `undefined` when detection is
 * disabled, set to `'vad'`, explicitly `'adaptive'` but incompatible with the current
 * configuration, or when the detector could not be instantiated.
 */
private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
  const mode = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;

  // Conditions checked before adaptive detection is considered compatible: a streaming
  // STT with aligned transcripts, a VAD, a turn detection mode other than
  // 'manual'/'realtime_llm', and a non-realtime LLM.
  const isCompatible = Boolean(
    this.stt &&
      this.stt.capabilities.alignedTranscript &&
      this.stt.capabilities.streaming &&
      this.vad &&
      this.turnDetection !== 'manual' &&
      this.turnDetection !== 'realtime_llm' &&
      !(this.llm instanceof RealtimeModel),
  );

  // NOTE(review): only an explicit 'adaptive' request is rejected here; an incompatible
  // configuration with an unset mode still falls through to detector creation below.
  // Confirm this is intended.
  if (!isCompatible && mode === 'adaptive') {
    this.logger.warn(
      "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
    );
    return undefined;
  }

  if (mode === false || mode === 'vad') {
    return undefined;
  }

  try {
    const adaptiveDetector = new AdaptiveInterruptionDetector();

    adaptiveDetector.on('user_overlapping_speech', this.onInterruptionOverlappingSpeech);
    adaptiveDetector.on('metrics_collected', this.onInterruptionMetricsCollected);
    adaptiveDetector.on('error', this.onInterruptionError);

    return adaptiveDetector;
  } catch (error: unknown) {
    // Best effort: log the failure and continue without a detector.
    this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
    return undefined;
  }
}
|
|
2858
|
+
|
|
2859
|
+
/**
 * Resets the live audio-activity interruption flag to the configured default value.
 */
private restoreInterruptionByAudioActivity(): void {
  const defaultEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
  this.isInterruptionByAudioActivityEnabled = defaultEnabled;
}
|
|
2862
|
+
|
|
2526
2863
|
private async _closeSessionResources(): Promise<void> {
|
|
2527
2864
|
// Unregister event handlers to prevent duplicate metrics
|
|
2528
2865
|
if (this.llm instanceof LLM) {
|