@livekit/agents 1.0.47 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +12 -12
- package/dist/inference/api_protos.d.ts +12 -12
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +28 -1
- package/dist/llm/chat_context.d.ts +28 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +2 -2
- package/dist/llm/index.d.ts +2 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +1 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +1 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +34 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +11 -2
- package/dist/voice/agent.d.ts +11 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +34 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +292 -44
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +27 -6
- package/dist/voice/agent_activity.d.ts +27 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +293 -45
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +16 -41
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +17 -43
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/constants.ts +13 -0
- package/src/index.ts +2 -1
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +161 -0
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/utils.ts +5 -0
- package/src/voice/agent.ts +41 -3
- package/src/voice/agent_activity.ts +371 -34
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +21 -64
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -2,11 +2,13 @@ import { Mutex } from "@livekit/mutex";
|
|
|
2
2
|
import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
|
|
3
3
|
import { Heap } from "heap-js";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
|
-
import { ReadableStream } from "node:stream/web";
|
|
5
|
+
import { ReadableStream, TransformStream } from "node:stream/web";
|
|
6
|
+
import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
|
|
6
7
|
import { ChatMessage } from "../llm/chat_context.js";
|
|
7
8
|
import {
|
|
8
9
|
LLM,
|
|
9
|
-
RealtimeModel
|
|
10
|
+
RealtimeModel,
|
|
11
|
+
ToolFlag
|
|
10
12
|
} from "../llm/index.js";
|
|
11
13
|
import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
|
|
12
14
|
import { log } from "../log.js";
|
|
@@ -48,6 +50,7 @@ import {
|
|
|
48
50
|
import { SpeechHandle } from "./speech_handle.js";
|
|
49
51
|
import { setParticipantSpanAttributes } from "./utils.js";
|
|
50
52
|
const agentActivityStorage = new AsyncLocalStorage();
|
|
53
|
+
const onEnterStorage = new AsyncLocalStorage();
|
|
51
54
|
class AgentActivity {
|
|
52
55
|
agent;
|
|
53
56
|
agentSession;
|
|
@@ -72,16 +75,34 @@ class AgentActivity {
|
|
|
72
75
|
// default to null as None, which maps to the default provider tool choice value
|
|
73
76
|
toolChoice = null;
|
|
74
77
|
_preemptiveGeneration;
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
_userTurnCompletedTask;
|
|
78
|
+
interruptionDetector;
|
|
79
|
+
isInterruptionDetectionEnabled;
|
|
80
|
+
isInterruptionByAudioActivityEnabled;
|
|
81
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
80
82
|
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
81
83
|
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
82
84
|
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
83
85
|
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
84
86
|
onModelError = (ev) => this.onError(ev);
|
|
87
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
88
|
+
this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
89
|
+
};
|
|
90
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
91
|
+
this.agentSession.emit(
|
|
92
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
93
|
+
createMetricsCollectedEvent({ metrics: ev })
|
|
94
|
+
);
|
|
95
|
+
};
|
|
96
|
+
onInterruptionError = (ev) => {
|
|
97
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
98
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
99
|
+
this.agentSession._onError(ev);
|
|
100
|
+
};
|
|
101
|
+
/** @internal */
|
|
102
|
+
_mainTask;
|
|
103
|
+
_onEnterTask;
|
|
104
|
+
_onExitTask;
|
|
105
|
+
_userTurnCompletedTask;
|
|
85
106
|
constructor(agent, agentSession) {
|
|
86
107
|
this.agent = agent;
|
|
87
108
|
this.agentSession = agentSession;
|
|
@@ -140,6 +161,10 @@ class AgentActivity {
|
|
|
140
161
|
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
141
162
|
);
|
|
142
163
|
}
|
|
164
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
165
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
166
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
167
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
143
168
|
}
|
|
144
169
|
async start() {
|
|
145
170
|
const unlock = await this.lock.lock();
|
|
@@ -232,8 +257,9 @@ class AgentActivity {
|
|
|
232
257
|
vad: this.vad,
|
|
233
258
|
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
234
259
|
turnDetectionMode: this.turnDetectionMode,
|
|
235
|
-
|
|
236
|
-
|
|
260
|
+
interruptionDetection: this.interruptionDetector,
|
|
261
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
262
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
237
263
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
238
264
|
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
239
265
|
sttProvider: this.getSttProvider(),
|
|
@@ -247,11 +273,14 @@ class AgentActivity {
|
|
|
247
273
|
this._resumeSchedulingTask();
|
|
248
274
|
if (runOnEnter) {
|
|
249
275
|
this._onEnterTask = this.createSpeechTask({
|
|
250
|
-
taskFn: () =>
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
276
|
+
taskFn: () => onEnterStorage.run(
|
|
277
|
+
{ session: this.agentSession, agent: this.agent },
|
|
278
|
+
() => tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
279
|
+
name: "on_enter",
|
|
280
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
281
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
282
|
+
})
|
|
283
|
+
),
|
|
255
284
|
inlineTask: true,
|
|
256
285
|
name: "AgentActivity_onEnter"
|
|
257
286
|
});
|
|
@@ -292,7 +321,8 @@ class AgentActivity {
|
|
|
292
321
|
return this.realtimeSession;
|
|
293
322
|
}
|
|
294
323
|
get allowInterruptions() {
|
|
295
|
-
|
|
324
|
+
var _a;
|
|
325
|
+
return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
|
|
296
326
|
}
|
|
297
327
|
get useTtsAlignedTranscript() {
|
|
298
328
|
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
@@ -303,6 +333,11 @@ class AgentActivity {
|
|
|
303
333
|
get toolCtx() {
|
|
304
334
|
return this.agent.toolCtx;
|
|
305
335
|
}
|
|
336
|
+
/** @internal */
|
|
337
|
+
get inputStartedAt() {
|
|
338
|
+
var _a;
|
|
339
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
340
|
+
}
|
|
306
341
|
async updateChatCtx(chatCtx) {
|
|
307
342
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
308
343
|
this.agent._chatCtx = chatCtx;
|
|
@@ -317,19 +352,50 @@ class AgentActivity {
|
|
|
317
352
|
});
|
|
318
353
|
}
|
|
319
354
|
}
|
|
320
|
-
|
|
355
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
356
|
+
async updateTools(tools) {
|
|
357
|
+
this.agent._tools = { ...tools };
|
|
358
|
+
if (this.realtimeSession) {
|
|
359
|
+
await this.realtimeSession.updateTools(tools);
|
|
360
|
+
}
|
|
361
|
+
if (this.llm instanceof LLM) {
|
|
362
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
updateOptions({
|
|
366
|
+
toolChoice,
|
|
367
|
+
turnDetection
|
|
368
|
+
}) {
|
|
321
369
|
if (toolChoice !== void 0) {
|
|
322
370
|
this.toolChoice = toolChoice;
|
|
323
371
|
}
|
|
324
372
|
if (this.realtimeSession) {
|
|
325
373
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
326
374
|
}
|
|
375
|
+
if (turnDetection !== void 0) {
|
|
376
|
+
this.turnDetectionMode = turnDetection;
|
|
377
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
378
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
379
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
if (this.audioRecognition) {
|
|
383
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
384
|
+
}
|
|
327
385
|
}
|
|
328
386
|
attachAudioInput(audioStream) {
|
|
329
387
|
void this.audioStream.close();
|
|
330
388
|
this.audioStream = new MultiInputStream();
|
|
389
|
+
const aecWarmupAudioFilter = new TransformStream({
|
|
390
|
+
transform: (frame, controller) => {
|
|
391
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
392
|
+
if (!shouldDiscardForAecWarmup) {
|
|
393
|
+
controller.enqueue(frame);
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
});
|
|
331
397
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
332
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
398
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
333
399
|
if (this.realtimeSession) {
|
|
334
400
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
335
401
|
}
|
|
@@ -435,6 +501,13 @@ class AgentActivity {
|
|
|
435
501
|
this.logger.info("onInputSpeechStarted");
|
|
436
502
|
if (!this.vad) {
|
|
437
503
|
this.agentSession._updateUserState("speaking");
|
|
504
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
505
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
506
|
+
0,
|
|
507
|
+
Date.now(),
|
|
508
|
+
this.agentSession._userSpeakingSpan
|
|
509
|
+
);
|
|
510
|
+
}
|
|
438
511
|
}
|
|
439
512
|
try {
|
|
440
513
|
this.interrupt();
|
|
@@ -448,6 +521,9 @@ class AgentActivity {
|
|
|
448
521
|
onInputSpeechStopped(ev) {
|
|
449
522
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
450
523
|
if (!this.vad) {
|
|
524
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
525
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
526
|
+
}
|
|
451
527
|
this.agentSession._updateUserState("listening");
|
|
452
528
|
}
|
|
453
529
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -509,48 +585,75 @@ class AgentActivity {
|
|
|
509
585
|
onStartOfSpeech(ev) {
|
|
510
586
|
let speechStartTime = Date.now();
|
|
511
587
|
if (ev) {
|
|
512
|
-
speechStartTime = speechStartTime - ev.speechDuration;
|
|
588
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
513
589
|
}
|
|
514
590
|
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
591
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
592
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
593
|
+
ev.speechDuration,
|
|
594
|
+
speechStartTime,
|
|
595
|
+
this.agentSession._userSpeakingSpan
|
|
596
|
+
);
|
|
597
|
+
}
|
|
515
598
|
}
|
|
516
599
|
onEndOfSpeech(ev) {
|
|
517
600
|
let speechEndTime = Date.now();
|
|
518
601
|
if (ev) {
|
|
519
|
-
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
602
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
603
|
+
}
|
|
604
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
605
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
606
|
+
speechEndTime,
|
|
607
|
+
this.agentSession._userSpeakingSpan
|
|
608
|
+
);
|
|
520
609
|
}
|
|
521
610
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
522
611
|
}
|
|
523
612
|
onVADInferenceDone(ev) {
|
|
613
|
+
var _a;
|
|
524
614
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
525
615
|
return;
|
|
526
616
|
}
|
|
527
|
-
if (ev.speechDuration >= this.agentSession.options.
|
|
617
|
+
if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
528
618
|
this.interruptByAudioActivity();
|
|
529
619
|
}
|
|
530
620
|
}
|
|
531
621
|
interruptByAudioActivity() {
|
|
532
|
-
var _a, _b;
|
|
622
|
+
var _a, _b, _c, _d;
|
|
623
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
624
|
+
return;
|
|
625
|
+
}
|
|
626
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
627
|
+
return;
|
|
628
|
+
}
|
|
533
629
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
534
630
|
return;
|
|
535
631
|
}
|
|
536
|
-
if (this.stt && this.agentSession.options.
|
|
632
|
+
if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
537
633
|
const text = this.audioRecognition.currentTranscript;
|
|
538
634
|
const normalizedText = text ?? "";
|
|
539
635
|
const wordCount = splitWords(normalizedText, true).length;
|
|
540
|
-
if (wordCount < this.agentSession.options.
|
|
636
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
541
637
|
return;
|
|
542
638
|
}
|
|
543
639
|
}
|
|
544
|
-
(
|
|
640
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
545
641
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
546
642
|
this.logger.info(
|
|
547
643
|
{ "speech id": this._currentSpeech.id },
|
|
548
644
|
"speech interrupted by audio activity"
|
|
549
645
|
);
|
|
550
|
-
(
|
|
646
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
551
647
|
this._currentSpeech.interrupt();
|
|
552
648
|
}
|
|
553
649
|
}
|
|
650
|
+
onInterruption(ev) {
|
|
651
|
+
this.restoreInterruptionByAudioActivity();
|
|
652
|
+
this.interruptByAudioActivity();
|
|
653
|
+
if (this.audioRecognition) {
|
|
654
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
|
|
655
|
+
}
|
|
656
|
+
}
|
|
554
657
|
onInterimTranscript(ev) {
|
|
555
658
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
556
659
|
return;
|
|
@@ -599,7 +702,8 @@ class AgentActivity {
|
|
|
599
702
|
);
|
|
600
703
|
const userMessage = ChatMessage.create({
|
|
601
704
|
role: "user",
|
|
602
|
-
content: info.newTranscript
|
|
705
|
+
content: info.newTranscript,
|
|
706
|
+
transcriptConfidence: info.transcriptConfidence
|
|
603
707
|
});
|
|
604
708
|
const chatCtx = this.agent.chatCtx.copy();
|
|
605
709
|
const speechHandle = this.generateReply({
|
|
@@ -657,6 +761,7 @@ class AgentActivity {
|
|
|
657
761
|
return task;
|
|
658
762
|
}
|
|
659
763
|
async onEndOfTurn(info) {
|
|
764
|
+
var _a, _b;
|
|
660
765
|
if (this.schedulingPaused) {
|
|
661
766
|
this.cancelPreemptiveGeneration();
|
|
662
767
|
this.logger.warn(
|
|
@@ -665,14 +770,14 @@ class AgentActivity {
|
|
|
665
770
|
);
|
|
666
771
|
return true;
|
|
667
772
|
}
|
|
668
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.
|
|
773
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
669
774
|
const wordCount = splitWords(info.newTranscript, true).length;
|
|
670
|
-
if (wordCount < this.agentSession.options.
|
|
775
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
671
776
|
this.cancelPreemptiveGeneration();
|
|
672
777
|
this.logger.info(
|
|
673
778
|
{
|
|
674
779
|
wordCount,
|
|
675
|
-
minInterruptionWords: this.agentSession.options.
|
|
780
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
|
|
676
781
|
},
|
|
677
782
|
"skipping user input, word count below minimum interruption threshold"
|
|
678
783
|
);
|
|
@@ -808,11 +913,18 @@ class AgentActivity {
|
|
|
808
913
|
instructions = `${this.agent.instructions}
|
|
809
914
|
${instructions}`;
|
|
810
915
|
}
|
|
916
|
+
const onEnterData = onEnterStorage.getStore();
|
|
917
|
+
const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
|
|
918
|
+
const tools = shouldFilterTools ? Object.fromEntries(
|
|
919
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
920
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
|
|
921
|
+
)
|
|
922
|
+
) : this.agent.toolCtx;
|
|
811
923
|
const task = this.createSpeechTask({
|
|
812
924
|
taskFn: (abortController) => this.pipelineReplyTask(
|
|
813
925
|
handle,
|
|
814
926
|
chatCtx ?? this.agent.chatCtx,
|
|
815
|
-
|
|
927
|
+
tools,
|
|
816
928
|
{
|
|
817
929
|
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
818
930
|
},
|
|
@@ -884,7 +996,8 @@ ${instructions}`;
|
|
|
884
996
|
}
|
|
885
997
|
let userMessage = ChatMessage.create({
|
|
886
998
|
role: "user",
|
|
887
|
-
content: info.newTranscript
|
|
999
|
+
content: info.newTranscript,
|
|
1000
|
+
transcriptConfidence: info.transcriptConfidence
|
|
888
1001
|
});
|
|
889
1002
|
const chatCtx = this.agent.chatCtx.copy();
|
|
890
1003
|
const startTime = Date.now();
|
|
@@ -902,11 +1015,32 @@ ${instructions}`;
|
|
|
902
1015
|
} else if (this.llm === void 0) {
|
|
903
1016
|
return;
|
|
904
1017
|
}
|
|
1018
|
+
const userMetricsReport = {};
|
|
1019
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1020
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1021
|
+
}
|
|
1022
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1023
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1024
|
+
}
|
|
1025
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1026
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1027
|
+
}
|
|
1028
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1029
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1030
|
+
}
|
|
1031
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1032
|
+
if (userMessage) {
|
|
1033
|
+
userMessage.metrics = userMetricsReport;
|
|
1034
|
+
}
|
|
905
1035
|
let speechHandle;
|
|
906
1036
|
if (this._preemptiveGeneration !== void 0) {
|
|
907
1037
|
const preemptive = this._preemptiveGeneration;
|
|
908
1038
|
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
|
|
909
1039
|
speechHandle = preemptive.speechHandle;
|
|
1040
|
+
if (preemptive.userMessage && userMessage) {
|
|
1041
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1042
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1043
|
+
}
|
|
910
1044
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
911
1045
|
this.logger.debug(
|
|
912
1046
|
{
|
|
@@ -940,6 +1074,7 @@ ${instructions}`;
|
|
|
940
1074
|
);
|
|
941
1075
|
}
|
|
942
1076
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
1077
|
+
var _a, _b;
|
|
943
1078
|
speechHandle._agentTurnContext = otelContext.active();
|
|
944
1079
|
speechHandleStorage.enterWith(speechHandle);
|
|
945
1080
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
@@ -972,11 +1107,18 @@ ${instructions}`;
|
|
|
972
1107
|
textOut = _textOut;
|
|
973
1108
|
tasks.push(textForwardTask);
|
|
974
1109
|
}
|
|
1110
|
+
let replyStartedSpeakingAt;
|
|
1111
|
+
let replyTtsGenData = null;
|
|
975
1112
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1113
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
976
1114
|
this.agentSession._updateAgentState("speaking", {
|
|
977
1115
|
startTime: startedSpeakingAt,
|
|
978
1116
|
otelContext: speechHandle._agentTurnContext
|
|
979
1117
|
});
|
|
1118
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1119
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1120
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1121
|
+
}
|
|
980
1122
|
};
|
|
981
1123
|
if (!audioOutput) {
|
|
982
1124
|
if (textOut) {
|
|
@@ -989,9 +1131,12 @@ ${instructions}`;
|
|
|
989
1131
|
(...args) => this.agent.ttsNode(...args),
|
|
990
1132
|
audioSource,
|
|
991
1133
|
modelSettings,
|
|
992
|
-
replyAbortController
|
|
1134
|
+
replyAbortController,
|
|
1135
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1136
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
993
1137
|
);
|
|
994
1138
|
tasks.push(ttsTask);
|
|
1139
|
+
replyTtsGenData = ttsGenData;
|
|
995
1140
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
996
1141
|
ttsGenData.audioStream,
|
|
997
1142
|
audioOutput,
|
|
@@ -1023,16 +1168,30 @@ ${instructions}`;
|
|
|
1023
1168
|
}
|
|
1024
1169
|
}
|
|
1025
1170
|
if (addToChatCtx) {
|
|
1171
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1172
|
+
const replyAssistantMetrics = {};
|
|
1173
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1174
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1175
|
+
}
|
|
1176
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1177
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1178
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1179
|
+
}
|
|
1026
1180
|
const message = ChatMessage.create({
|
|
1027
1181
|
role: "assistant",
|
|
1028
1182
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
1029
|
-
interrupted: speechHandle.interrupted
|
|
1183
|
+
interrupted: speechHandle.interrupted,
|
|
1184
|
+
metrics: replyAssistantMetrics
|
|
1030
1185
|
});
|
|
1031
1186
|
this.agent._chatCtx.insert(message);
|
|
1032
1187
|
this.agentSession._conversationItemAdded(message);
|
|
1033
1188
|
}
|
|
1034
1189
|
if (this.agentSession.agentState === "speaking") {
|
|
1035
1190
|
this.agentSession._updateAgentState("listening");
|
|
1191
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1192
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1193
|
+
}
|
|
1194
|
+
this.restoreInterruptionByAudioActivity();
|
|
1036
1195
|
}
|
|
1037
1196
|
}
|
|
1038
1197
|
_pipelineReplyTaskImpl = async ({
|
|
@@ -1044,9 +1203,10 @@ ${instructions}`;
|
|
|
1044
1203
|
instructions,
|
|
1045
1204
|
newMessage,
|
|
1046
1205
|
toolsMessages,
|
|
1047
|
-
span
|
|
1206
|
+
span,
|
|
1207
|
+
_previousUserMetrics
|
|
1048
1208
|
}) => {
|
|
1049
|
-
var _a, _b;
|
|
1209
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1050
1210
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1051
1211
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1052
1212
|
if (instructions) {
|
|
@@ -1084,7 +1244,9 @@ ${instructions}`;
|
|
|
1084
1244
|
chatCtx,
|
|
1085
1245
|
toolCtx,
|
|
1086
1246
|
modelSettings,
|
|
1087
|
-
replyAbortController
|
|
1247
|
+
replyAbortController,
|
|
1248
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1249
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
1088
1250
|
);
|
|
1089
1251
|
tasks.push(llmTask);
|
|
1090
1252
|
let ttsTask = null;
|
|
@@ -1097,16 +1259,20 @@ ${instructions}`;
|
|
|
1097
1259
|
(...args) => this.agent.ttsNode(...args),
|
|
1098
1260
|
ttsTextInput,
|
|
1099
1261
|
modelSettings,
|
|
1100
|
-
replyAbortController
|
|
1262
|
+
replyAbortController,
|
|
1263
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1264
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
1101
1265
|
);
|
|
1102
1266
|
tasks.push(ttsTask);
|
|
1103
1267
|
} else {
|
|
1104
1268
|
llmOutput = llmGenData.textStream;
|
|
1105
1269
|
}
|
|
1106
1270
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1271
|
+
let userMetrics = _previousUserMetrics;
|
|
1107
1272
|
if (newMessage && speechHandle.scheduled) {
|
|
1108
1273
|
this.agent._chatCtx.insert(newMessage);
|
|
1109
1274
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1275
|
+
userMetrics = newMessage.metrics;
|
|
1110
1276
|
}
|
|
1111
1277
|
if (speechHandle.interrupted) {
|
|
1112
1278
|
replyAbortController.abort();
|
|
@@ -1118,7 +1284,7 @@ ${instructions}`;
|
|
|
1118
1284
|
speechHandle._clearAuthorization();
|
|
1119
1285
|
const replyStartedAt = Date.now();
|
|
1120
1286
|
let transcriptionInput = llmOutput;
|
|
1121
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1287
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1122
1288
|
const timedTextsStream = await Promise.race([
|
|
1123
1289
|
ttsGenData.timedTextsFut.await,
|
|
1124
1290
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1141,11 +1307,17 @@ ${instructions}`;
|
|
|
1141
1307
|
tasks.push(textForwardTask);
|
|
1142
1308
|
textOut = _textOut;
|
|
1143
1309
|
}
|
|
1310
|
+
let agentStartedSpeakingAt;
|
|
1144
1311
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1312
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1145
1313
|
this.agentSession._updateAgentState("speaking", {
|
|
1146
1314
|
startTime: startedSpeakingAt,
|
|
1147
1315
|
otelContext: speechHandle._agentTurnContext
|
|
1148
1316
|
});
|
|
1317
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1318
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1319
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1320
|
+
}
|
|
1149
1321
|
};
|
|
1150
1322
|
let audioOut = null;
|
|
1151
1323
|
if (audioOutput) {
|
|
@@ -1188,6 +1360,25 @@ ${instructions}`;
|
|
|
1188
1360
|
if (audioOutput) {
|
|
1189
1361
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1190
1362
|
}
|
|
1363
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1364
|
+
const assistantMetrics = {};
|
|
1365
|
+
if (llmGenData.ttft !== void 0) {
|
|
1366
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1367
|
+
}
|
|
1368
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1369
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1370
|
+
}
|
|
1371
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1372
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1373
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1374
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1375
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1376
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1377
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1378
|
+
}
|
|
1379
|
+
}
|
|
1380
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1381
|
+
let hasSpeechMessage = false;
|
|
1191
1382
|
if (toolsMessages) {
|
|
1192
1383
|
for (const msg of toolsMessages) {
|
|
1193
1384
|
msg.createdAt = replyStartedAt;
|
|
@@ -1228,20 +1419,27 @@ ${instructions}`;
|
|
|
1228
1419
|
}
|
|
1229
1420
|
}
|
|
1230
1421
|
if (forwardedText) {
|
|
1422
|
+
hasSpeechMessage = true;
|
|
1231
1423
|
const message = ChatMessage.create({
|
|
1232
1424
|
role: "assistant",
|
|
1233
1425
|
content: forwardedText,
|
|
1234
1426
|
id: llmGenData.id,
|
|
1235
1427
|
interrupted: true,
|
|
1236
|
-
createdAt: replyStartedAt
|
|
1428
|
+
createdAt: replyStartedAt,
|
|
1429
|
+
metrics: assistantMetrics
|
|
1237
1430
|
});
|
|
1238
1431
|
chatCtx.insert(message);
|
|
1239
1432
|
this.agent._chatCtx.insert(message);
|
|
1240
1433
|
speechHandle._itemAdded([message]);
|
|
1241
1434
|
this.agentSession._conversationItemAdded(message);
|
|
1435
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1242
1436
|
}
|
|
1243
1437
|
if (this.agentSession.agentState === "speaking") {
|
|
1244
1438
|
this.agentSession._updateAgentState("listening");
|
|
1439
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1440
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1441
|
+
this.restoreInterruptionByAudioActivity();
|
|
1442
|
+
}
|
|
1245
1443
|
}
|
|
1246
1444
|
this.logger.info(
|
|
1247
1445
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -1252,17 +1450,20 @@ ${instructions}`;
|
|
|
1252
1450
|
return;
|
|
1253
1451
|
}
|
|
1254
1452
|
if (textOut && textOut.text) {
|
|
1453
|
+
hasSpeechMessage = true;
|
|
1255
1454
|
const message = ChatMessage.create({
|
|
1256
1455
|
role: "assistant",
|
|
1257
1456
|
id: llmGenData.id,
|
|
1258
1457
|
interrupted: false,
|
|
1259
1458
|
createdAt: replyStartedAt,
|
|
1260
|
-
content: textOut.text
|
|
1459
|
+
content: textOut.text,
|
|
1460
|
+
metrics: assistantMetrics
|
|
1261
1461
|
});
|
|
1262
1462
|
chatCtx.insert(message);
|
|
1263
1463
|
this.agent._chatCtx.insert(message);
|
|
1264
1464
|
speechHandle._itemAdded([message]);
|
|
1265
1465
|
this.agentSession._conversationItemAdded(message);
|
|
1466
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1266
1467
|
this.logger.info(
|
|
1267
1468
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1268
1469
|
"playout completed without interruption"
|
|
@@ -1272,6 +1473,12 @@ ${instructions}`;
|
|
|
1272
1473
|
this.agentSession._updateAgentState("thinking");
|
|
1273
1474
|
} else if (this.agentSession.agentState === "speaking") {
|
|
1274
1475
|
this.agentSession._updateAgentState("listening");
|
|
1476
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1477
|
+
{
|
|
1478
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1479
|
+
this.restoreInterruptionByAudioActivity();
|
|
1480
|
+
}
|
|
1481
|
+
}
|
|
1275
1482
|
}
|
|
1276
1483
|
speechHandle._markGenerationDone();
|
|
1277
1484
|
await executeToolsTask.result;
|
|
@@ -1311,7 +1518,8 @@ ${instructions}`;
|
|
|
1311
1518
|
replyAbortController,
|
|
1312
1519
|
instructions,
|
|
1313
1520
|
void 0,
|
|
1314
|
-
toolMessages
|
|
1521
|
+
toolMessages,
|
|
1522
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1315
1523
|
),
|
|
1316
1524
|
ownedSpeechHandle: speechHandle,
|
|
1317
1525
|
name: "AgentActivity.pipelineReply"
|
|
@@ -1331,7 +1539,7 @@ ${instructions}`;
|
|
|
1331
1539
|
}
|
|
1332
1540
|
}
|
|
1333
1541
|
};
|
|
1334
|
-
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
|
|
1542
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
|
|
1335
1543
|
async (span) => this._pipelineReplyTaskImpl({
|
|
1336
1544
|
speechHandle,
|
|
1337
1545
|
chatCtx,
|
|
@@ -1341,7 +1549,8 @@ ${instructions}`;
|
|
|
1341
1549
|
instructions,
|
|
1342
1550
|
newMessage,
|
|
1343
1551
|
toolsMessages,
|
|
1344
|
-
span
|
|
1552
|
+
span,
|
|
1553
|
+
_previousUserMetrics
|
|
1345
1554
|
}),
|
|
1346
1555
|
{
|
|
1347
1556
|
name: "agent_turn",
|
|
@@ -1407,6 +1616,7 @@ ${instructions}`;
|
|
|
1407
1616
|
});
|
|
1408
1617
|
};
|
|
1409
1618
|
const readMessages = async (abortController, outputs) => {
|
|
1619
|
+
var _a2, _b;
|
|
1410
1620
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1411
1621
|
once: true
|
|
1412
1622
|
});
|
|
@@ -1453,7 +1663,9 @@ ${instructions}`;
|
|
|
1453
1663
|
(...args) => this.agent.ttsNode(...args),
|
|
1454
1664
|
ttsTextInput,
|
|
1455
1665
|
modelSettings,
|
|
1456
|
-
abortController
|
|
1666
|
+
abortController,
|
|
1667
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1668
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1457
1669
|
);
|
|
1458
1670
|
tasks.push(ttsTask);
|
|
1459
1671
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -1845,11 +2057,46 @@ ${instructions}`;
|
|
|
1845
2057
|
if (this._mainTask) {
|
|
1846
2058
|
await this._mainTask.cancelAndWait();
|
|
1847
2059
|
}
|
|
2060
|
+
if (this.interruptionDetector) {
|
|
2061
|
+
this.interruptionDetector.off(
|
|
2062
|
+
"user_overlapping_speech",
|
|
2063
|
+
this.onInterruptionOverlappingSpeech
|
|
2064
|
+
);
|
|
2065
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2066
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
2067
|
+
}
|
|
1848
2068
|
this.agent._agentActivity = void 0;
|
|
1849
2069
|
} finally {
|
|
1850
2070
|
unlock();
|
|
1851
2071
|
}
|
|
1852
2072
|
}
|
|
2073
|
+
resolveInterruptionDetector() {
|
|
2074
|
+
const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
|
|
2075
|
+
if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
|
|
2076
|
+
if (interruptionDetection === "adaptive") {
|
|
2077
|
+
this.logger.warn(
|
|
2078
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
|
|
2079
|
+
);
|
|
2080
|
+
return void 0;
|
|
2081
|
+
}
|
|
2082
|
+
}
|
|
2083
|
+
if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
|
|
2084
|
+
return void 0;
|
|
2085
|
+
}
|
|
2086
|
+
try {
|
|
2087
|
+
const detector = new AdaptiveInterruptionDetector();
|
|
2088
|
+
detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2089
|
+
detector.on("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2090
|
+
detector.on("error", this.onInterruptionError);
|
|
2091
|
+
return detector;
|
|
2092
|
+
} catch (error) {
|
|
2093
|
+
this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
|
|
2094
|
+
}
|
|
2095
|
+
return void 0;
|
|
2096
|
+
}
|
|
2097
|
+
restoreInterruptionByAudioActivity() {
|
|
2098
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2099
|
+
}
|
|
1853
2100
|
async _closeSessionResources() {
|
|
1854
2101
|
var _a, _b, _c;
|
|
1855
2102
|
if (this.llm instanceof LLM) {
|
|
@@ -1891,6 +2138,7 @@ function toOaiToolChoice(toolChoice) {
|
|
|
1891
2138
|
}
|
|
1892
2139
|
export {
|
|
1893
2140
|
AgentActivity,
|
|
1894
|
-
agentActivityStorage
|
|
2141
|
+
agentActivityStorage,
|
|
2142
|
+
onEnterStorage
|
|
1895
2143
|
};
|
|
1896
2144
|
//# sourceMappingURL=agent_activity.js.map
|