@livekit/agents 1.0.47 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +12 -12
- package/dist/inference/api_protos.d.ts +12 -12
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +28 -1
- package/dist/llm/chat_context.d.ts +28 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +2 -2
- package/dist/llm/index.d.ts +2 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +1 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +1 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +34 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +11 -2
- package/dist/voice/agent.d.ts +11 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +34 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +292 -44
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +27 -6
- package/dist/voice/agent_activity.d.ts +27 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +293 -45
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +16 -41
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +17 -43
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/constants.ts +13 -0
- package/src/index.ts +2 -1
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +161 -0
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/utils.ts +5 -0
- package/src/voice/agent.ts +41 -3
- package/src/voice/agent_activity.ts +371 -34
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +21 -64
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -19,7 +19,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
19
19
|
var agent_activity_exports = {};
|
|
20
20
|
__export(agent_activity_exports, {
|
|
21
21
|
AgentActivity: () => AgentActivity,
|
|
22
|
-
agentActivityStorage: () => agentActivityStorage
|
|
22
|
+
agentActivityStorage: () => agentActivityStorage,
|
|
23
|
+
onEnterStorage: () => onEnterStorage
|
|
23
24
|
});
|
|
24
25
|
module.exports = __toCommonJS(agent_activity_exports);
|
|
25
26
|
var import_mutex = require("@livekit/mutex");
|
|
@@ -27,6 +28,7 @@ var import_api = require("@opentelemetry/api");
|
|
|
27
28
|
var import_heap_js = require("heap-js");
|
|
28
29
|
var import_node_async_hooks = require("node:async_hooks");
|
|
29
30
|
var import_web = require("node:stream/web");
|
|
31
|
+
var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
|
|
30
32
|
var import_chat_context = require("../llm/chat_context.cjs");
|
|
31
33
|
var import_llm = require("../llm/index.cjs");
|
|
32
34
|
var import_tool_context = require("../llm/tool_context.cjs");
|
|
@@ -46,6 +48,7 @@ var import_generation = require("./generation.cjs");
|
|
|
46
48
|
var import_speech_handle = require("./speech_handle.cjs");
|
|
47
49
|
var import_utils2 = require("./utils.cjs");
|
|
48
50
|
const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
51
|
+
const onEnterStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
49
52
|
class AgentActivity {
|
|
50
53
|
agent;
|
|
51
54
|
agentSession;
|
|
@@ -70,16 +73,34 @@ class AgentActivity {
|
|
|
70
73
|
// default to null as None, which maps to the default provider tool choice value
|
|
71
74
|
toolChoice = null;
|
|
72
75
|
_preemptiveGeneration;
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
_userTurnCompletedTask;
|
|
76
|
+
interruptionDetector;
|
|
77
|
+
isInterruptionDetectionEnabled;
|
|
78
|
+
isInterruptionByAudioActivityEnabled;
|
|
79
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
78
80
|
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
79
81
|
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
80
82
|
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
81
83
|
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
82
84
|
onModelError = (ev) => this.onError(ev);
|
|
85
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
86
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
87
|
+
};
|
|
88
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
89
|
+
this.agentSession.emit(
|
|
90
|
+
import_events.AgentSessionEventTypes.MetricsCollected,
|
|
91
|
+
(0, import_events.createMetricsCollectedEvent)({ metrics: ev })
|
|
92
|
+
);
|
|
93
|
+
};
|
|
94
|
+
onInterruptionError = (ev) => {
|
|
95
|
+
const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
|
|
96
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
|
|
97
|
+
this.agentSession._onError(ev);
|
|
98
|
+
};
|
|
99
|
+
/** @internal */
|
|
100
|
+
_mainTask;
|
|
101
|
+
_onEnterTask;
|
|
102
|
+
_onExitTask;
|
|
103
|
+
_userTurnCompletedTask;
|
|
83
104
|
constructor(agent, agentSession) {
|
|
84
105
|
this.agent = agent;
|
|
85
106
|
this.agentSession = agentSession;
|
|
@@ -138,6 +159,10 @@ class AgentActivity {
|
|
|
138
159
|
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
139
160
|
);
|
|
140
161
|
}
|
|
162
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
163
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
164
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
165
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
141
166
|
}
|
|
142
167
|
async start() {
|
|
143
168
|
const unlock = await this.lock.lock();
|
|
@@ -230,8 +255,9 @@ class AgentActivity {
|
|
|
230
255
|
vad: this.vad,
|
|
231
256
|
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
232
257
|
turnDetectionMode: this.turnDetectionMode,
|
|
233
|
-
|
|
234
|
-
|
|
258
|
+
interruptionDetection: this.interruptionDetector,
|
|
259
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
260
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
235
261
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
236
262
|
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
237
263
|
sttProvider: this.getSttProvider(),
|
|
@@ -245,11 +271,14 @@ class AgentActivity {
|
|
|
245
271
|
this._resumeSchedulingTask();
|
|
246
272
|
if (runOnEnter) {
|
|
247
273
|
this._onEnterTask = this.createSpeechTask({
|
|
248
|
-
taskFn: () =>
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
274
|
+
taskFn: () => onEnterStorage.run(
|
|
275
|
+
{ session: this.agentSession, agent: this.agent },
|
|
276
|
+
() => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
277
|
+
name: "on_enter",
|
|
278
|
+
context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
|
|
279
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
280
|
+
})
|
|
281
|
+
),
|
|
253
282
|
inlineTask: true,
|
|
254
283
|
name: "AgentActivity_onEnter"
|
|
255
284
|
});
|
|
@@ -290,7 +319,8 @@ class AgentActivity {
|
|
|
290
319
|
return this.realtimeSession;
|
|
291
320
|
}
|
|
292
321
|
get allowInterruptions() {
|
|
293
|
-
|
|
322
|
+
var _a;
|
|
323
|
+
return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
|
|
294
324
|
}
|
|
295
325
|
get useTtsAlignedTranscript() {
|
|
296
326
|
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
@@ -301,6 +331,11 @@ class AgentActivity {
|
|
|
301
331
|
get toolCtx() {
|
|
302
332
|
return this.agent.toolCtx;
|
|
303
333
|
}
|
|
334
|
+
/** @internal */
|
|
335
|
+
get inputStartedAt() {
|
|
336
|
+
var _a;
|
|
337
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
338
|
+
}
|
|
304
339
|
async updateChatCtx(chatCtx) {
|
|
305
340
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
306
341
|
this.agent._chatCtx = chatCtx;
|
|
@@ -315,19 +350,50 @@ class AgentActivity {
|
|
|
315
350
|
});
|
|
316
351
|
}
|
|
317
352
|
}
|
|
318
|
-
|
|
353
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
354
|
+
async updateTools(tools) {
|
|
355
|
+
this.agent._tools = { ...tools };
|
|
356
|
+
if (this.realtimeSession) {
|
|
357
|
+
await this.realtimeSession.updateTools(tools);
|
|
358
|
+
}
|
|
359
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
360
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
updateOptions({
|
|
364
|
+
toolChoice,
|
|
365
|
+
turnDetection
|
|
366
|
+
}) {
|
|
319
367
|
if (toolChoice !== void 0) {
|
|
320
368
|
this.toolChoice = toolChoice;
|
|
321
369
|
}
|
|
322
370
|
if (this.realtimeSession) {
|
|
323
371
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
324
372
|
}
|
|
373
|
+
if (turnDetection !== void 0) {
|
|
374
|
+
this.turnDetectionMode = turnDetection;
|
|
375
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
376
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
377
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
if (this.audioRecognition) {
|
|
381
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
382
|
+
}
|
|
325
383
|
}
|
|
326
384
|
attachAudioInput(audioStream) {
|
|
327
385
|
void this.audioStream.close();
|
|
328
386
|
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
387
|
+
const aecWarmupAudioFilter = new import_web.TransformStream({
|
|
388
|
+
transform: (frame, controller) => {
|
|
389
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
390
|
+
if (!shouldDiscardForAecWarmup) {
|
|
391
|
+
controller.enqueue(frame);
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
});
|
|
329
395
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
330
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
396
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
331
397
|
if (this.realtimeSession) {
|
|
332
398
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
333
399
|
}
|
|
@@ -433,6 +499,13 @@ class AgentActivity {
|
|
|
433
499
|
this.logger.info("onInputSpeechStarted");
|
|
434
500
|
if (!this.vad) {
|
|
435
501
|
this.agentSession._updateUserState("speaking");
|
|
502
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
503
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
504
|
+
0,
|
|
505
|
+
Date.now(),
|
|
506
|
+
this.agentSession._userSpeakingSpan
|
|
507
|
+
);
|
|
508
|
+
}
|
|
436
509
|
}
|
|
437
510
|
try {
|
|
438
511
|
this.interrupt();
|
|
@@ -446,6 +519,9 @@ class AgentActivity {
|
|
|
446
519
|
onInputSpeechStopped(ev) {
|
|
447
520
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
448
521
|
if (!this.vad) {
|
|
522
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
523
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
524
|
+
}
|
|
449
525
|
this.agentSession._updateUserState("listening");
|
|
450
526
|
}
|
|
451
527
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -507,48 +583,75 @@ class AgentActivity {
|
|
|
507
583
|
onStartOfSpeech(ev) {
|
|
508
584
|
let speechStartTime = Date.now();
|
|
509
585
|
if (ev) {
|
|
510
|
-
speechStartTime = speechStartTime - ev.speechDuration;
|
|
586
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
511
587
|
}
|
|
512
588
|
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
589
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
590
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
591
|
+
ev.speechDuration,
|
|
592
|
+
speechStartTime,
|
|
593
|
+
this.agentSession._userSpeakingSpan
|
|
594
|
+
);
|
|
595
|
+
}
|
|
513
596
|
}
|
|
514
597
|
onEndOfSpeech(ev) {
|
|
515
598
|
let speechEndTime = Date.now();
|
|
516
599
|
if (ev) {
|
|
517
|
-
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
600
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
601
|
+
}
|
|
602
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
603
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
604
|
+
speechEndTime,
|
|
605
|
+
this.agentSession._userSpeakingSpan
|
|
606
|
+
);
|
|
518
607
|
}
|
|
519
608
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
520
609
|
}
|
|
521
610
|
onVADInferenceDone(ev) {
|
|
611
|
+
var _a;
|
|
522
612
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
523
613
|
return;
|
|
524
614
|
}
|
|
525
|
-
if (ev.speechDuration >= this.agentSession.options.
|
|
615
|
+
if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
526
616
|
this.interruptByAudioActivity();
|
|
527
617
|
}
|
|
528
618
|
}
|
|
529
619
|
interruptByAudioActivity() {
|
|
530
|
-
var _a, _b;
|
|
620
|
+
var _a, _b, _c, _d;
|
|
621
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
622
|
+
return;
|
|
623
|
+
}
|
|
624
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
625
|
+
return;
|
|
626
|
+
}
|
|
531
627
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
532
628
|
return;
|
|
533
629
|
}
|
|
534
|
-
if (this.stt && this.agentSession.options.
|
|
630
|
+
if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
535
631
|
const text = this.audioRecognition.currentTranscript;
|
|
536
632
|
const normalizedText = text ?? "";
|
|
537
633
|
const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
|
|
538
|
-
if (wordCount < this.agentSession.options.
|
|
634
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
539
635
|
return;
|
|
540
636
|
}
|
|
541
637
|
}
|
|
542
|
-
(
|
|
638
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
543
639
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
544
640
|
this.logger.info(
|
|
545
641
|
{ "speech id": this._currentSpeech.id },
|
|
546
642
|
"speech interrupted by audio activity"
|
|
547
643
|
);
|
|
548
|
-
(
|
|
644
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
549
645
|
this._currentSpeech.interrupt();
|
|
550
646
|
}
|
|
551
647
|
}
|
|
648
|
+
onInterruption(ev) {
|
|
649
|
+
this.restoreInterruptionByAudioActivity();
|
|
650
|
+
this.interruptByAudioActivity();
|
|
651
|
+
if (this.audioRecognition) {
|
|
652
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
|
|
653
|
+
}
|
|
654
|
+
}
|
|
552
655
|
onInterimTranscript(ev) {
|
|
553
656
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
554
657
|
return;
|
|
@@ -597,7 +700,8 @@ class AgentActivity {
|
|
|
597
700
|
);
|
|
598
701
|
const userMessage = import_chat_context.ChatMessage.create({
|
|
599
702
|
role: "user",
|
|
600
|
-
content: info.newTranscript
|
|
703
|
+
content: info.newTranscript,
|
|
704
|
+
transcriptConfidence: info.transcriptConfidence
|
|
601
705
|
});
|
|
602
706
|
const chatCtx = this.agent.chatCtx.copy();
|
|
603
707
|
const speechHandle = this.generateReply({
|
|
@@ -655,6 +759,7 @@ class AgentActivity {
|
|
|
655
759
|
return task;
|
|
656
760
|
}
|
|
657
761
|
async onEndOfTurn(info) {
|
|
762
|
+
var _a, _b;
|
|
658
763
|
if (this.schedulingPaused) {
|
|
659
764
|
this.cancelPreemptiveGeneration();
|
|
660
765
|
this.logger.warn(
|
|
@@ -663,14 +768,14 @@ class AgentActivity {
|
|
|
663
768
|
);
|
|
664
769
|
return true;
|
|
665
770
|
}
|
|
666
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.
|
|
771
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
667
772
|
const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
|
|
668
|
-
if (wordCount < this.agentSession.options.
|
|
773
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
669
774
|
this.cancelPreemptiveGeneration();
|
|
670
775
|
this.logger.info(
|
|
671
776
|
{
|
|
672
777
|
wordCount,
|
|
673
|
-
minInterruptionWords: this.agentSession.options.
|
|
778
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
|
|
674
779
|
},
|
|
675
780
|
"skipping user input, word count below minimum interruption threshold"
|
|
676
781
|
);
|
|
@@ -806,11 +911,18 @@ class AgentActivity {
|
|
|
806
911
|
instructions = `${this.agent.instructions}
|
|
807
912
|
${instructions}`;
|
|
808
913
|
}
|
|
914
|
+
const onEnterData = onEnterStorage.getStore();
|
|
915
|
+
const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
|
|
916
|
+
const tools = shouldFilterTools ? Object.fromEntries(
|
|
917
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
918
|
+
([, fnTool]) => !(fnTool.flags & import_llm.ToolFlag.IGNORE_ON_ENTER)
|
|
919
|
+
)
|
|
920
|
+
) : this.agent.toolCtx;
|
|
809
921
|
const task = this.createSpeechTask({
|
|
810
922
|
taskFn: (abortController) => this.pipelineReplyTask(
|
|
811
923
|
handle,
|
|
812
924
|
chatCtx ?? this.agent.chatCtx,
|
|
813
|
-
|
|
925
|
+
tools,
|
|
814
926
|
{
|
|
815
927
|
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
816
928
|
},
|
|
@@ -882,7 +994,8 @@ ${instructions}`;
|
|
|
882
994
|
}
|
|
883
995
|
let userMessage = import_chat_context.ChatMessage.create({
|
|
884
996
|
role: "user",
|
|
885
|
-
content: info.newTranscript
|
|
997
|
+
content: info.newTranscript,
|
|
998
|
+
transcriptConfidence: info.transcriptConfidence
|
|
886
999
|
});
|
|
887
1000
|
const chatCtx = this.agent.chatCtx.copy();
|
|
888
1001
|
const startTime = Date.now();
|
|
@@ -900,11 +1013,32 @@ ${instructions}`;
|
|
|
900
1013
|
} else if (this.llm === void 0) {
|
|
901
1014
|
return;
|
|
902
1015
|
}
|
|
1016
|
+
const userMetricsReport = {};
|
|
1017
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1018
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1019
|
+
}
|
|
1020
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1021
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1022
|
+
}
|
|
1023
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1024
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1025
|
+
}
|
|
1026
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1027
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1028
|
+
}
|
|
1029
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1030
|
+
if (userMessage) {
|
|
1031
|
+
userMessage.metrics = userMetricsReport;
|
|
1032
|
+
}
|
|
903
1033
|
let speechHandle;
|
|
904
1034
|
if (this._preemptiveGeneration !== void 0) {
|
|
905
1035
|
const preemptive = this._preemptiveGeneration;
|
|
906
1036
|
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
|
|
907
1037
|
speechHandle = preemptive.speechHandle;
|
|
1038
|
+
if (preemptive.userMessage && userMessage) {
|
|
1039
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1040
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1041
|
+
}
|
|
908
1042
|
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
909
1043
|
this.logger.debug(
|
|
910
1044
|
{
|
|
@@ -938,6 +1072,7 @@ ${instructions}`;
|
|
|
938
1072
|
);
|
|
939
1073
|
}
|
|
940
1074
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
1075
|
+
var _a, _b;
|
|
941
1076
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
942
1077
|
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
943
1078
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
@@ -970,11 +1105,18 @@ ${instructions}`;
|
|
|
970
1105
|
textOut = _textOut;
|
|
971
1106
|
tasks.push(textForwardTask);
|
|
972
1107
|
}
|
|
1108
|
+
let replyStartedSpeakingAt;
|
|
1109
|
+
let replyTtsGenData = null;
|
|
973
1110
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1111
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
974
1112
|
this.agentSession._updateAgentState("speaking", {
|
|
975
1113
|
startTime: startedSpeakingAt,
|
|
976
1114
|
otelContext: speechHandle._agentTurnContext
|
|
977
1115
|
});
|
|
1116
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1117
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1118
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1119
|
+
}
|
|
978
1120
|
};
|
|
979
1121
|
if (!audioOutput) {
|
|
980
1122
|
if (textOut) {
|
|
@@ -987,9 +1129,12 @@ ${instructions}`;
|
|
|
987
1129
|
(...args) => this.agent.ttsNode(...args),
|
|
988
1130
|
audioSource,
|
|
989
1131
|
modelSettings,
|
|
990
|
-
replyAbortController
|
|
1132
|
+
replyAbortController,
|
|
1133
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1134
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
991
1135
|
);
|
|
992
1136
|
tasks.push(ttsTask);
|
|
1137
|
+
replyTtsGenData = ttsGenData;
|
|
993
1138
|
const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
|
|
994
1139
|
ttsGenData.audioStream,
|
|
995
1140
|
audioOutput,
|
|
@@ -1021,16 +1166,30 @@ ${instructions}`;
|
|
|
1021
1166
|
}
|
|
1022
1167
|
}
|
|
1023
1168
|
if (addToChatCtx) {
|
|
1169
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1170
|
+
const replyAssistantMetrics = {};
|
|
1171
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1172
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1173
|
+
}
|
|
1174
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1175
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1176
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1177
|
+
}
|
|
1024
1178
|
const message = import_chat_context.ChatMessage.create({
|
|
1025
1179
|
role: "assistant",
|
|
1026
1180
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
1027
|
-
interrupted: speechHandle.interrupted
|
|
1181
|
+
interrupted: speechHandle.interrupted,
|
|
1182
|
+
metrics: replyAssistantMetrics
|
|
1028
1183
|
});
|
|
1029
1184
|
this.agent._chatCtx.insert(message);
|
|
1030
1185
|
this.agentSession._conversationItemAdded(message);
|
|
1031
1186
|
}
|
|
1032
1187
|
if (this.agentSession.agentState === "speaking") {
|
|
1033
1188
|
this.agentSession._updateAgentState("listening");
|
|
1189
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1190
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1191
|
+
}
|
|
1192
|
+
this.restoreInterruptionByAudioActivity();
|
|
1034
1193
|
}
|
|
1035
1194
|
}
|
|
1036
1195
|
_pipelineReplyTaskImpl = async ({
|
|
@@ -1042,9 +1201,10 @@ ${instructions}`;
|
|
|
1042
1201
|
instructions,
|
|
1043
1202
|
newMessage,
|
|
1044
1203
|
toolsMessages,
|
|
1045
|
-
span
|
|
1204
|
+
span,
|
|
1205
|
+
_previousUserMetrics
|
|
1046
1206
|
}) => {
|
|
1047
|
-
var _a, _b;
|
|
1207
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1048
1208
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
1049
1209
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1050
1210
|
if (instructions) {
|
|
@@ -1082,7 +1242,9 @@ ${instructions}`;
|
|
|
1082
1242
|
chatCtx,
|
|
1083
1243
|
toolCtx,
|
|
1084
1244
|
modelSettings,
|
|
1085
|
-
replyAbortController
|
|
1245
|
+
replyAbortController,
|
|
1246
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1247
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
1086
1248
|
);
|
|
1087
1249
|
tasks.push(llmTask);
|
|
1088
1250
|
let ttsTask = null;
|
|
@@ -1095,16 +1257,20 @@ ${instructions}`;
|
|
|
1095
1257
|
(...args) => this.agent.ttsNode(...args),
|
|
1096
1258
|
ttsTextInput,
|
|
1097
1259
|
modelSettings,
|
|
1098
|
-
replyAbortController
|
|
1260
|
+
replyAbortController,
|
|
1261
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1262
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
1099
1263
|
);
|
|
1100
1264
|
tasks.push(ttsTask);
|
|
1101
1265
|
} else {
|
|
1102
1266
|
llmOutput = llmGenData.textStream;
|
|
1103
1267
|
}
|
|
1104
1268
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1269
|
+
let userMetrics = _previousUserMetrics;
|
|
1105
1270
|
if (newMessage && speechHandle.scheduled) {
|
|
1106
1271
|
this.agent._chatCtx.insert(newMessage);
|
|
1107
1272
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1273
|
+
userMetrics = newMessage.metrics;
|
|
1108
1274
|
}
|
|
1109
1275
|
if (speechHandle.interrupted) {
|
|
1110
1276
|
replyAbortController.abort();
|
|
@@ -1116,7 +1282,7 @@ ${instructions}`;
|
|
|
1116
1282
|
speechHandle._clearAuthorization();
|
|
1117
1283
|
const replyStartedAt = Date.now();
|
|
1118
1284
|
let transcriptionInput = llmOutput;
|
|
1119
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1285
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1120
1286
|
const timedTextsStream = await Promise.race([
|
|
1121
1287
|
ttsGenData.timedTextsFut.await,
|
|
1122
1288
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1139,11 +1305,17 @@ ${instructions}`;
|
|
|
1139
1305
|
tasks.push(textForwardTask);
|
|
1140
1306
|
textOut = _textOut;
|
|
1141
1307
|
}
|
|
1308
|
+
let agentStartedSpeakingAt;
|
|
1142
1309
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1310
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1143
1311
|
this.agentSession._updateAgentState("speaking", {
|
|
1144
1312
|
startTime: startedSpeakingAt,
|
|
1145
1313
|
otelContext: speechHandle._agentTurnContext
|
|
1146
1314
|
});
|
|
1315
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1316
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1317
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1318
|
+
}
|
|
1147
1319
|
};
|
|
1148
1320
|
let audioOut = null;
|
|
1149
1321
|
if (audioOutput) {
|
|
@@ -1186,6 +1358,25 @@ ${instructions}`;
|
|
|
1186
1358
|
if (audioOutput) {
|
|
1187
1359
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1188
1360
|
}
|
|
1361
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1362
|
+
const assistantMetrics = {};
|
|
1363
|
+
if (llmGenData.ttft !== void 0) {
|
|
1364
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1365
|
+
}
|
|
1366
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1367
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1368
|
+
}
|
|
1369
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1370
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1371
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1372
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1373
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1374
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1375
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1376
|
+
}
|
|
1377
|
+
}
|
|
1378
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1379
|
+
let hasSpeechMessage = false;
|
|
1189
1380
|
if (toolsMessages) {
|
|
1190
1381
|
for (const msg of toolsMessages) {
|
|
1191
1382
|
msg.createdAt = replyStartedAt;
|
|
@@ -1226,20 +1417,27 @@ ${instructions}`;
|
|
|
1226
1417
|
}
|
|
1227
1418
|
}
|
|
1228
1419
|
if (forwardedText) {
|
|
1420
|
+
hasSpeechMessage = true;
|
|
1229
1421
|
const message = import_chat_context.ChatMessage.create({
|
|
1230
1422
|
role: "assistant",
|
|
1231
1423
|
content: forwardedText,
|
|
1232
1424
|
id: llmGenData.id,
|
|
1233
1425
|
interrupted: true,
|
|
1234
|
-
createdAt: replyStartedAt
|
|
1426
|
+
createdAt: replyStartedAt,
|
|
1427
|
+
metrics: assistantMetrics
|
|
1235
1428
|
});
|
|
1236
1429
|
chatCtx.insert(message);
|
|
1237
1430
|
this.agent._chatCtx.insert(message);
|
|
1238
1431
|
speechHandle._itemAdded([message]);
|
|
1239
1432
|
this.agentSession._conversationItemAdded(message);
|
|
1433
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1240
1434
|
}
|
|
1241
1435
|
if (this.agentSession.agentState === "speaking") {
|
|
1242
1436
|
this.agentSession._updateAgentState("listening");
|
|
1437
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1438
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1439
|
+
this.restoreInterruptionByAudioActivity();
|
|
1440
|
+
}
|
|
1243
1441
|
}
|
|
1244
1442
|
this.logger.info(
|
|
1245
1443
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -1250,17 +1448,20 @@ ${instructions}`;
|
|
|
1250
1448
|
return;
|
|
1251
1449
|
}
|
|
1252
1450
|
if (textOut && textOut.text) {
|
|
1451
|
+
hasSpeechMessage = true;
|
|
1253
1452
|
const message = import_chat_context.ChatMessage.create({
|
|
1254
1453
|
role: "assistant",
|
|
1255
1454
|
id: llmGenData.id,
|
|
1256
1455
|
interrupted: false,
|
|
1257
1456
|
createdAt: replyStartedAt,
|
|
1258
|
-
content: textOut.text
|
|
1457
|
+
content: textOut.text,
|
|
1458
|
+
metrics: assistantMetrics
|
|
1259
1459
|
});
|
|
1260
1460
|
chatCtx.insert(message);
|
|
1261
1461
|
this.agent._chatCtx.insert(message);
|
|
1262
1462
|
speechHandle._itemAdded([message]);
|
|
1263
1463
|
this.agentSession._conversationItemAdded(message);
|
|
1464
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1264
1465
|
this.logger.info(
|
|
1265
1466
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1266
1467
|
"playout completed without interruption"
|
|
@@ -1270,6 +1471,12 @@ ${instructions}`;
|
|
|
1270
1471
|
this.agentSession._updateAgentState("thinking");
|
|
1271
1472
|
} else if (this.agentSession.agentState === "speaking") {
|
|
1272
1473
|
this.agentSession._updateAgentState("listening");
|
|
1474
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1475
|
+
{
|
|
1476
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1477
|
+
this.restoreInterruptionByAudioActivity();
|
|
1478
|
+
}
|
|
1479
|
+
}
|
|
1273
1480
|
}
|
|
1274
1481
|
speechHandle._markGenerationDone();
|
|
1275
1482
|
await executeToolsTask.result;
|
|
@@ -1309,7 +1516,8 @@ ${instructions}`;
|
|
|
1309
1516
|
replyAbortController,
|
|
1310
1517
|
instructions,
|
|
1311
1518
|
void 0,
|
|
1312
|
-
toolMessages
|
|
1519
|
+
toolMessages,
|
|
1520
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1313
1521
|
),
|
|
1314
1522
|
ownedSpeechHandle: speechHandle,
|
|
1315
1523
|
name: "AgentActivity.pipelineReply"
|
|
@@ -1329,7 +1537,7 @@ ${instructions}`;
|
|
|
1329
1537
|
}
|
|
1330
1538
|
}
|
|
1331
1539
|
};
|
|
1332
|
-
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
|
|
1540
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
|
|
1333
1541
|
async (span) => this._pipelineReplyTaskImpl({
|
|
1334
1542
|
speechHandle,
|
|
1335
1543
|
chatCtx,
|
|
@@ -1339,7 +1547,8 @@ ${instructions}`;
|
|
|
1339
1547
|
instructions,
|
|
1340
1548
|
newMessage,
|
|
1341
1549
|
toolsMessages,
|
|
1342
|
-
span
|
|
1550
|
+
span,
|
|
1551
|
+
_previousUserMetrics
|
|
1343
1552
|
}),
|
|
1344
1553
|
{
|
|
1345
1554
|
name: "agent_turn",
|
|
@@ -1405,6 +1614,7 @@ ${instructions}`;
|
|
|
1405
1614
|
});
|
|
1406
1615
|
};
|
|
1407
1616
|
const readMessages = async (abortController, outputs) => {
|
|
1617
|
+
var _a2, _b;
|
|
1408
1618
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1409
1619
|
once: true
|
|
1410
1620
|
});
|
|
@@ -1451,7 +1661,9 @@ ${instructions}`;
|
|
|
1451
1661
|
(...args) => this.agent.ttsNode(...args),
|
|
1452
1662
|
ttsTextInput,
|
|
1453
1663
|
modelSettings,
|
|
1454
|
-
abortController
|
|
1664
|
+
abortController,
|
|
1665
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1666
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1455
1667
|
);
|
|
1456
1668
|
tasks.push(ttsTask);
|
|
1457
1669
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -1843,11 +2055,46 @@ ${instructions}`;
|
|
|
1843
2055
|
if (this._mainTask) {
|
|
1844
2056
|
await this._mainTask.cancelAndWait();
|
|
1845
2057
|
}
|
|
2058
|
+
if (this.interruptionDetector) {
|
|
2059
|
+
this.interruptionDetector.off(
|
|
2060
|
+
"user_overlapping_speech",
|
|
2061
|
+
this.onInterruptionOverlappingSpeech
|
|
2062
|
+
);
|
|
2063
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2064
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
2065
|
+
}
|
|
1846
2066
|
this.agent._agentActivity = void 0;
|
|
1847
2067
|
} finally {
|
|
1848
2068
|
unlock();
|
|
1849
2069
|
}
|
|
1850
2070
|
}
|
|
2071
|
+
resolveInterruptionDetector() {
|
|
2072
|
+
const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
|
|
2073
|
+
if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
|
|
2074
|
+
if (interruptionDetection === "adaptive") {
|
|
2075
|
+
this.logger.warn(
|
|
2076
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
|
|
2077
|
+
);
|
|
2078
|
+
return void 0;
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
|
|
2082
|
+
return void 0;
|
|
2083
|
+
}
|
|
2084
|
+
try {
|
|
2085
|
+
const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
|
|
2086
|
+
detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2087
|
+
detector.on("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2088
|
+
detector.on("error", this.onInterruptionError);
|
|
2089
|
+
return detector;
|
|
2090
|
+
} catch (error) {
|
|
2091
|
+
this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
|
|
2092
|
+
}
|
|
2093
|
+
return void 0;
|
|
2094
|
+
}
|
|
2095
|
+
restoreInterruptionByAudioActivity() {
|
|
2096
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2097
|
+
}
|
|
1851
2098
|
async _closeSessionResources() {
|
|
1852
2099
|
var _a, _b, _c;
|
|
1853
2100
|
if (this.llm instanceof import_llm.LLM) {
|
|
@@ -1890,6 +2137,7 @@ function toOaiToolChoice(toolChoice) {
|
|
|
1890
2137
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1891
2138
|
0 && (module.exports = {
|
|
1892
2139
|
AgentActivity,
|
|
1893
|
-
agentActivityStorage
|
|
2140
|
+
agentActivityStorage,
|
|
2141
|
+
onEnterStorage
|
|
1894
2142
|
});
|
|
1895
2143
|
//# sourceMappingURL=agent_activity.cjs.map
|