@livekit/agents 1.0.48 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/inference/api_protos.d.cts +71 -71
- package/dist/inference/api_protos.d.ts +71 -71
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +20 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +19 -1
- package/dist/llm/chat_context.d.ts +19 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +20 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +25 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +10 -2
- package/dist/voice/agent.d.ts +10 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +25 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +261 -36
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +20 -6
- package/dist/voice/agent_activity.d.ts +20 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +262 -37
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +5 -39
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +5 -40
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/constants.ts +13 -0
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.ts +40 -2
- package/src/llm/index.ts +1 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/voice/agent.ts +30 -3
- package/src/voice/agent_activity.ts +327 -28
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +7 -61
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -28,6 +28,7 @@ var import_api = require("@opentelemetry/api");
|
|
|
28
28
|
var import_heap_js = require("heap-js");
|
|
29
29
|
var import_node_async_hooks = require("node:async_hooks");
|
|
30
30
|
var import_web = require("node:stream/web");
|
|
31
|
+
var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
|
|
31
32
|
var import_chat_context = require("../llm/chat_context.cjs");
|
|
32
33
|
var import_llm = require("../llm/index.cjs");
|
|
33
34
|
var import_tool_context = require("../llm/tool_context.cjs");
|
|
@@ -72,16 +73,34 @@ class AgentActivity {
|
|
|
72
73
|
// default to null as None, which maps to the default provider tool choice value
|
|
73
74
|
toolChoice = null;
|
|
74
75
|
_preemptiveGeneration;
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
_userTurnCompletedTask;
|
|
76
|
+
interruptionDetector;
|
|
77
|
+
isInterruptionDetectionEnabled;
|
|
78
|
+
isInterruptionByAudioActivityEnabled;
|
|
79
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
80
80
|
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
81
81
|
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
82
82
|
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
83
83
|
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
84
84
|
onModelError = (ev) => this.onError(ev);
|
|
85
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
86
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
87
|
+
};
|
|
88
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
89
|
+
this.agentSession.emit(
|
|
90
|
+
import_events.AgentSessionEventTypes.MetricsCollected,
|
|
91
|
+
(0, import_events.createMetricsCollectedEvent)({ metrics: ev })
|
|
92
|
+
);
|
|
93
|
+
};
|
|
94
|
+
onInterruptionError = (ev) => {
|
|
95
|
+
const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
|
|
96
|
+
this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
|
|
97
|
+
this.agentSession._onError(ev);
|
|
98
|
+
};
|
|
99
|
+
/** @internal */
|
|
100
|
+
_mainTask;
|
|
101
|
+
_onEnterTask;
|
|
102
|
+
_onExitTask;
|
|
103
|
+
_userTurnCompletedTask;
|
|
85
104
|
constructor(agent, agentSession) {
|
|
86
105
|
this.agent = agent;
|
|
87
106
|
this.agentSession = agentSession;
|
|
@@ -140,6 +159,10 @@ class AgentActivity {
|
|
|
140
159
|
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
141
160
|
);
|
|
142
161
|
}
|
|
162
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
163
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
164
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
165
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
143
166
|
}
|
|
144
167
|
async start() {
|
|
145
168
|
const unlock = await this.lock.lock();
|
|
@@ -232,8 +255,9 @@ class AgentActivity {
|
|
|
232
255
|
vad: this.vad,
|
|
233
256
|
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
234
257
|
turnDetectionMode: this.turnDetectionMode,
|
|
235
|
-
|
|
236
|
-
|
|
258
|
+
interruptionDetection: this.interruptionDetector,
|
|
259
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
260
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
237
261
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
238
262
|
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
239
263
|
sttProvider: this.getSttProvider(),
|
|
@@ -295,7 +319,8 @@ class AgentActivity {
|
|
|
295
319
|
return this.realtimeSession;
|
|
296
320
|
}
|
|
297
321
|
get allowInterruptions() {
|
|
298
|
-
|
|
322
|
+
var _a;
|
|
323
|
+
return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
|
|
299
324
|
}
|
|
300
325
|
get useTtsAlignedTranscript() {
|
|
301
326
|
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
@@ -306,6 +331,11 @@ class AgentActivity {
|
|
|
306
331
|
get toolCtx() {
|
|
307
332
|
return this.agent.toolCtx;
|
|
308
333
|
}
|
|
334
|
+
/** @internal */
|
|
335
|
+
get inputStartedAt() {
|
|
336
|
+
var _a;
|
|
337
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
338
|
+
}
|
|
309
339
|
async updateChatCtx(chatCtx) {
|
|
310
340
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
311
341
|
this.agent._chatCtx = chatCtx;
|
|
@@ -330,19 +360,40 @@ class AgentActivity {
|
|
|
330
360
|
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
331
361
|
}
|
|
332
362
|
}
|
|
333
|
-
updateOptions({ toolChoice }) {
|
|
363
|
+
updateOptions({
|
|
364
|
+
toolChoice,
|
|
365
|
+
turnDetection
|
|
366
|
+
}) {
|
|
334
367
|
if (toolChoice !== void 0) {
|
|
335
368
|
this.toolChoice = toolChoice;
|
|
336
369
|
}
|
|
337
370
|
if (this.realtimeSession) {
|
|
338
371
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
339
372
|
}
|
|
373
|
+
if (turnDetection !== void 0) {
|
|
374
|
+
this.turnDetectionMode = turnDetection;
|
|
375
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
376
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
377
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
if (this.audioRecognition) {
|
|
381
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
382
|
+
}
|
|
340
383
|
}
|
|
341
384
|
attachAudioInput(audioStream) {
|
|
342
385
|
void this.audioStream.close();
|
|
343
386
|
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
387
|
+
const aecWarmupAudioFilter = new import_web.TransformStream({
|
|
388
|
+
transform: (frame, controller) => {
|
|
389
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
390
|
+
if (!shouldDiscardForAecWarmup) {
|
|
391
|
+
controller.enqueue(frame);
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
});
|
|
344
395
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
345
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
396
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
346
397
|
if (this.realtimeSession) {
|
|
347
398
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
348
399
|
}
|
|
@@ -448,6 +499,13 @@ class AgentActivity {
|
|
|
448
499
|
this.logger.info("onInputSpeechStarted");
|
|
449
500
|
if (!this.vad) {
|
|
450
501
|
this.agentSession._updateUserState("speaking");
|
|
502
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
503
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
504
|
+
0,
|
|
505
|
+
Date.now(),
|
|
506
|
+
this.agentSession._userSpeakingSpan
|
|
507
|
+
);
|
|
508
|
+
}
|
|
451
509
|
}
|
|
452
510
|
try {
|
|
453
511
|
this.interrupt();
|
|
@@ -461,6 +519,9 @@ class AgentActivity {
|
|
|
461
519
|
onInputSpeechStopped(ev) {
|
|
462
520
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
463
521
|
if (!this.vad) {
|
|
522
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
523
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
524
|
+
}
|
|
464
525
|
this.agentSession._updateUserState("listening");
|
|
465
526
|
}
|
|
466
527
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -522,48 +583,75 @@ class AgentActivity {
|
|
|
522
583
|
onStartOfSpeech(ev) {
|
|
523
584
|
let speechStartTime = Date.now();
|
|
524
585
|
if (ev) {
|
|
525
|
-
speechStartTime = speechStartTime - ev.speechDuration;
|
|
586
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
526
587
|
}
|
|
527
588
|
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
589
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
590
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
591
|
+
ev.speechDuration,
|
|
592
|
+
speechStartTime,
|
|
593
|
+
this.agentSession._userSpeakingSpan
|
|
594
|
+
);
|
|
595
|
+
}
|
|
528
596
|
}
|
|
529
597
|
onEndOfSpeech(ev) {
|
|
530
598
|
let speechEndTime = Date.now();
|
|
531
599
|
if (ev) {
|
|
532
|
-
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
600
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
601
|
+
}
|
|
602
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
603
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
604
|
+
speechEndTime,
|
|
605
|
+
this.agentSession._userSpeakingSpan
|
|
606
|
+
);
|
|
533
607
|
}
|
|
534
608
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
535
609
|
}
|
|
536
610
|
onVADInferenceDone(ev) {
|
|
611
|
+
var _a;
|
|
537
612
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
538
613
|
return;
|
|
539
614
|
}
|
|
540
|
-
if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
|
|
615
|
+
if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
541
616
|
this.interruptByAudioActivity();
|
|
542
617
|
}
|
|
543
618
|
}
|
|
544
619
|
interruptByAudioActivity() {
|
|
545
|
-
var _a, _b;
|
|
620
|
+
var _a, _b, _c, _d;
|
|
621
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
622
|
+
return;
|
|
623
|
+
}
|
|
624
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
625
|
+
return;
|
|
626
|
+
}
|
|
546
627
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
547
628
|
return;
|
|
548
629
|
}
|
|
549
|
-
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
|
|
630
|
+
if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
550
631
|
const text = this.audioRecognition.currentTranscript;
|
|
551
632
|
const normalizedText = text ?? "";
|
|
552
633
|
const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
|
|
553
|
-
if (wordCount < this.agentSession.options.minInterruptionWords) {
|
|
634
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
554
635
|
return;
|
|
555
636
|
}
|
|
556
637
|
}
|
|
557
|
-
(_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
|
|
638
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
558
639
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
559
640
|
this.logger.info(
|
|
560
641
|
{ "speech id": this._currentSpeech.id },
|
|
561
642
|
"speech interrupted by audio activity"
|
|
562
643
|
);
|
|
563
|
-
(_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
|
|
644
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
564
645
|
this._currentSpeech.interrupt();
|
|
565
646
|
}
|
|
566
647
|
}
|
|
648
|
+
onInterruption(ev) {
|
|
649
|
+
this.restoreInterruptionByAudioActivity();
|
|
650
|
+
this.interruptByAudioActivity();
|
|
651
|
+
if (this.audioRecognition) {
|
|
652
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
|
|
653
|
+
}
|
|
654
|
+
}
|
|
567
655
|
onInterimTranscript(ev) {
|
|
568
656
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
569
657
|
return;
|
|
@@ -612,7 +700,8 @@ class AgentActivity {
|
|
|
612
700
|
);
|
|
613
701
|
const userMessage = import_chat_context.ChatMessage.create({
|
|
614
702
|
role: "user",
|
|
615
|
-
content: info.newTranscript
|
|
703
|
+
content: info.newTranscript,
|
|
704
|
+
transcriptConfidence: info.transcriptConfidence
|
|
616
705
|
});
|
|
617
706
|
const chatCtx = this.agent.chatCtx.copy();
|
|
618
707
|
const speechHandle = this.generateReply({
|
|
@@ -670,6 +759,7 @@ class AgentActivity {
|
|
|
670
759
|
return task;
|
|
671
760
|
}
|
|
672
761
|
async onEndOfTurn(info) {
|
|
762
|
+
var _a, _b;
|
|
673
763
|
if (this.schedulingPaused) {
|
|
674
764
|
this.cancelPreemptiveGeneration();
|
|
675
765
|
this.logger.warn(
|
|
@@ -678,14 +768,14 @@ class AgentActivity {
|
|
|
678
768
|
);
|
|
679
769
|
return true;
|
|
680
770
|
}
|
|
681
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
771
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
682
772
|
const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
|
|
683
|
-
if (wordCount < this.agentSession.options.
|
|
773
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
684
774
|
this.cancelPreemptiveGeneration();
|
|
685
775
|
this.logger.info(
|
|
686
776
|
{
|
|
687
777
|
wordCount,
|
|
688
|
-
minInterruptionWords: this.agentSession.options.
|
|
778
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
|
|
689
779
|
},
|
|
690
780
|
"skipping user input, word count below minimum interruption threshold"
|
|
691
781
|
);
|
|
@@ -904,7 +994,8 @@ ${instructions}`;
|
|
|
904
994
|
}
|
|
905
995
|
let userMessage = import_chat_context.ChatMessage.create({
|
|
906
996
|
role: "user",
|
|
907
|
-
content: info.newTranscript
|
|
997
|
+
content: info.newTranscript,
|
|
998
|
+
transcriptConfidence: info.transcriptConfidence
|
|
908
999
|
});
|
|
909
1000
|
const chatCtx = this.agent.chatCtx.copy();
|
|
910
1001
|
const startTime = Date.now();
|
|
@@ -922,11 +1013,32 @@ ${instructions}`;
|
|
|
922
1013
|
} else if (this.llm === void 0) {
|
|
923
1014
|
return;
|
|
924
1015
|
}
|
|
1016
|
+
const userMetricsReport = {};
|
|
1017
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1018
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1019
|
+
}
|
|
1020
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1021
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1022
|
+
}
|
|
1023
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1024
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1025
|
+
}
|
|
1026
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1027
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1028
|
+
}
|
|
1029
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1030
|
+
if (userMessage) {
|
|
1031
|
+
userMessage.metrics = userMetricsReport;
|
|
1032
|
+
}
|
|
925
1033
|
let speechHandle;
|
|
926
1034
|
if (this._preemptiveGeneration !== void 0) {
|
|
927
1035
|
const preemptive = this._preemptiveGeneration;
|
|
928
1036
|
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
|
|
929
1037
|
speechHandle = preemptive.speechHandle;
|
|
1038
|
+
if (preemptive.userMessage && userMessage) {
|
|
1039
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1040
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1041
|
+
}
|
|
930
1042
|
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
931
1043
|
this.logger.debug(
|
|
932
1044
|
{
|
|
@@ -960,6 +1072,7 @@ ${instructions}`;
|
|
|
960
1072
|
);
|
|
961
1073
|
}
|
|
962
1074
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
1075
|
+
var _a, _b;
|
|
963
1076
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
964
1077
|
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
965
1078
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
@@ -992,11 +1105,18 @@ ${instructions}`;
|
|
|
992
1105
|
textOut = _textOut;
|
|
993
1106
|
tasks.push(textForwardTask);
|
|
994
1107
|
}
|
|
1108
|
+
let replyStartedSpeakingAt;
|
|
1109
|
+
let replyTtsGenData = null;
|
|
995
1110
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1111
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
996
1112
|
this.agentSession._updateAgentState("speaking", {
|
|
997
1113
|
startTime: startedSpeakingAt,
|
|
998
1114
|
otelContext: speechHandle._agentTurnContext
|
|
999
1115
|
});
|
|
1116
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1117
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1118
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1119
|
+
}
|
|
1000
1120
|
};
|
|
1001
1121
|
if (!audioOutput) {
|
|
1002
1122
|
if (textOut) {
|
|
@@ -1009,9 +1129,12 @@ ${instructions}`;
|
|
|
1009
1129
|
(...args) => this.agent.ttsNode(...args),
|
|
1010
1130
|
audioSource,
|
|
1011
1131
|
modelSettings,
|
|
1012
|
-
replyAbortController
|
|
1132
|
+
replyAbortController,
|
|
1133
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1134
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1013
1135
|
);
|
|
1014
1136
|
tasks.push(ttsTask);
|
|
1137
|
+
replyTtsGenData = ttsGenData;
|
|
1015
1138
|
const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
|
|
1016
1139
|
ttsGenData.audioStream,
|
|
1017
1140
|
audioOutput,
|
|
@@ -1043,16 +1166,30 @@ ${instructions}`;
|
|
|
1043
1166
|
}
|
|
1044
1167
|
}
|
|
1045
1168
|
if (addToChatCtx) {
|
|
1169
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1170
|
+
const replyAssistantMetrics = {};
|
|
1171
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1172
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1173
|
+
}
|
|
1174
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1175
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1176
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1177
|
+
}
|
|
1046
1178
|
const message = import_chat_context.ChatMessage.create({
|
|
1047
1179
|
role: "assistant",
|
|
1048
1180
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
1049
|
-
interrupted: speechHandle.interrupted
|
|
1181
|
+
interrupted: speechHandle.interrupted,
|
|
1182
|
+
metrics: replyAssistantMetrics
|
|
1050
1183
|
});
|
|
1051
1184
|
this.agent._chatCtx.insert(message);
|
|
1052
1185
|
this.agentSession._conversationItemAdded(message);
|
|
1053
1186
|
}
|
|
1054
1187
|
if (this.agentSession.agentState === "speaking") {
|
|
1055
1188
|
this.agentSession._updateAgentState("listening");
|
|
1189
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1190
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1191
|
+
}
|
|
1192
|
+
this.restoreInterruptionByAudioActivity();
|
|
1056
1193
|
}
|
|
1057
1194
|
}
|
|
1058
1195
|
_pipelineReplyTaskImpl = async ({
|
|
@@ -1064,9 +1201,10 @@ ${instructions}`;
|
|
|
1064
1201
|
instructions,
|
|
1065
1202
|
newMessage,
|
|
1066
1203
|
toolsMessages,
|
|
1067
|
-
span
|
|
1204
|
+
span,
|
|
1205
|
+
_previousUserMetrics
|
|
1068
1206
|
}) => {
|
|
1069
|
-
var _a, _b;
|
|
1207
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1070
1208
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
1071
1209
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1072
1210
|
if (instructions) {
|
|
@@ -1104,7 +1242,9 @@ ${instructions}`;
|
|
|
1104
1242
|
chatCtx,
|
|
1105
1243
|
toolCtx,
|
|
1106
1244
|
modelSettings,
|
|
1107
|
-
replyAbortController
|
|
1245
|
+
replyAbortController,
|
|
1246
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1247
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
1108
1248
|
);
|
|
1109
1249
|
tasks.push(llmTask);
|
|
1110
1250
|
let ttsTask = null;
|
|
@@ -1117,16 +1257,20 @@ ${instructions}`;
|
|
|
1117
1257
|
(...args) => this.agent.ttsNode(...args),
|
|
1118
1258
|
ttsTextInput,
|
|
1119
1259
|
modelSettings,
|
|
1120
|
-
replyAbortController
|
|
1260
|
+
replyAbortController,
|
|
1261
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1262
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
1121
1263
|
);
|
|
1122
1264
|
tasks.push(ttsTask);
|
|
1123
1265
|
} else {
|
|
1124
1266
|
llmOutput = llmGenData.textStream;
|
|
1125
1267
|
}
|
|
1126
1268
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1269
|
+
let userMetrics = _previousUserMetrics;
|
|
1127
1270
|
if (newMessage && speechHandle.scheduled) {
|
|
1128
1271
|
this.agent._chatCtx.insert(newMessage);
|
|
1129
1272
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1273
|
+
userMetrics = newMessage.metrics;
|
|
1130
1274
|
}
|
|
1131
1275
|
if (speechHandle.interrupted) {
|
|
1132
1276
|
replyAbortController.abort();
|
|
@@ -1138,7 +1282,7 @@ ${instructions}`;
|
|
|
1138
1282
|
speechHandle._clearAuthorization();
|
|
1139
1283
|
const replyStartedAt = Date.now();
|
|
1140
1284
|
let transcriptionInput = llmOutput;
|
|
1141
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1285
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1142
1286
|
const timedTextsStream = await Promise.race([
|
|
1143
1287
|
ttsGenData.timedTextsFut.await,
|
|
1144
1288
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1161,11 +1305,17 @@ ${instructions}`;
|
|
|
1161
1305
|
tasks.push(textForwardTask);
|
|
1162
1306
|
textOut = _textOut;
|
|
1163
1307
|
}
|
|
1308
|
+
let agentStartedSpeakingAt;
|
|
1164
1309
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1310
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1165
1311
|
this.agentSession._updateAgentState("speaking", {
|
|
1166
1312
|
startTime: startedSpeakingAt,
|
|
1167
1313
|
otelContext: speechHandle._agentTurnContext
|
|
1168
1314
|
});
|
|
1315
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1316
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1317
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1318
|
+
}
|
|
1169
1319
|
};
|
|
1170
1320
|
let audioOut = null;
|
|
1171
1321
|
if (audioOutput) {
|
|
@@ -1208,6 +1358,25 @@ ${instructions}`;
|
|
|
1208
1358
|
if (audioOutput) {
|
|
1209
1359
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1210
1360
|
}
|
|
1361
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1362
|
+
const assistantMetrics = {};
|
|
1363
|
+
if (llmGenData.ttft !== void 0) {
|
|
1364
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1365
|
+
}
|
|
1366
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1367
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1368
|
+
}
|
|
1369
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1370
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1371
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1372
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1373
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1374
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1375
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1376
|
+
}
|
|
1377
|
+
}
|
|
1378
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1379
|
+
let hasSpeechMessage = false;
|
|
1211
1380
|
if (toolsMessages) {
|
|
1212
1381
|
for (const msg of toolsMessages) {
|
|
1213
1382
|
msg.createdAt = replyStartedAt;
|
|
@@ -1248,20 +1417,27 @@ ${instructions}`;
|
|
|
1248
1417
|
}
|
|
1249
1418
|
}
|
|
1250
1419
|
if (forwardedText) {
|
|
1420
|
+
hasSpeechMessage = true;
|
|
1251
1421
|
const message = import_chat_context.ChatMessage.create({
|
|
1252
1422
|
role: "assistant",
|
|
1253
1423
|
content: forwardedText,
|
|
1254
1424
|
id: llmGenData.id,
|
|
1255
1425
|
interrupted: true,
|
|
1256
|
-
createdAt: replyStartedAt
|
|
1426
|
+
createdAt: replyStartedAt,
|
|
1427
|
+
metrics: assistantMetrics
|
|
1257
1428
|
});
|
|
1258
1429
|
chatCtx.insert(message);
|
|
1259
1430
|
this.agent._chatCtx.insert(message);
|
|
1260
1431
|
speechHandle._itemAdded([message]);
|
|
1261
1432
|
this.agentSession._conversationItemAdded(message);
|
|
1433
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1262
1434
|
}
|
|
1263
1435
|
if (this.agentSession.agentState === "speaking") {
|
|
1264
1436
|
this.agentSession._updateAgentState("listening");
|
|
1437
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1438
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1439
|
+
this.restoreInterruptionByAudioActivity();
|
|
1440
|
+
}
|
|
1265
1441
|
}
|
|
1266
1442
|
this.logger.info(
|
|
1267
1443
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -1272,17 +1448,20 @@ ${instructions}`;
|
|
|
1272
1448
|
return;
|
|
1273
1449
|
}
|
|
1274
1450
|
if (textOut && textOut.text) {
|
|
1451
|
+
hasSpeechMessage = true;
|
|
1275
1452
|
const message = import_chat_context.ChatMessage.create({
|
|
1276
1453
|
role: "assistant",
|
|
1277
1454
|
id: llmGenData.id,
|
|
1278
1455
|
interrupted: false,
|
|
1279
1456
|
createdAt: replyStartedAt,
|
|
1280
|
-
content: textOut.text
|
|
1457
|
+
content: textOut.text,
|
|
1458
|
+
metrics: assistantMetrics
|
|
1281
1459
|
});
|
|
1282
1460
|
chatCtx.insert(message);
|
|
1283
1461
|
this.agent._chatCtx.insert(message);
|
|
1284
1462
|
speechHandle._itemAdded([message]);
|
|
1285
1463
|
this.agentSession._conversationItemAdded(message);
|
|
1464
|
+
span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1286
1465
|
this.logger.info(
|
|
1287
1466
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1288
1467
|
"playout completed without interruption"
|
|
@@ -1292,6 +1471,12 @@ ${instructions}`;
|
|
|
1292
1471
|
this.agentSession._updateAgentState("thinking");
|
|
1293
1472
|
} else if (this.agentSession.agentState === "speaking") {
|
|
1294
1473
|
this.agentSession._updateAgentState("listening");
|
|
1474
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1475
|
+
{
|
|
1476
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1477
|
+
this.restoreInterruptionByAudioActivity();
|
|
1478
|
+
}
|
|
1479
|
+
}
|
|
1295
1480
|
}
|
|
1296
1481
|
speechHandle._markGenerationDone();
|
|
1297
1482
|
await executeToolsTask.result;
|
|
@@ -1331,7 +1516,8 @@ ${instructions}`;
|
|
|
1331
1516
|
replyAbortController,
|
|
1332
1517
|
instructions,
|
|
1333
1518
|
void 0,
|
|
1334
|
-
toolMessages
|
|
1519
|
+
toolMessages,
|
|
1520
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1335
1521
|
),
|
|
1336
1522
|
ownedSpeechHandle: speechHandle,
|
|
1337
1523
|
name: "AgentActivity.pipelineReply"
|
|
@@ -1351,7 +1537,7 @@ ${instructions}`;
|
|
|
1351
1537
|
}
|
|
1352
1538
|
}
|
|
1353
1539
|
};
|
|
1354
|
-
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
|
|
1540
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
|
|
1355
1541
|
async (span) => this._pipelineReplyTaskImpl({
|
|
1356
1542
|
speechHandle,
|
|
1357
1543
|
chatCtx,
|
|
@@ -1361,7 +1547,8 @@ ${instructions}`;
|
|
|
1361
1547
|
instructions,
|
|
1362
1548
|
newMessage,
|
|
1363
1549
|
toolsMessages,
|
|
1364
|
-
span
|
|
1550
|
+
span,
|
|
1551
|
+
_previousUserMetrics
|
|
1365
1552
|
}),
|
|
1366
1553
|
{
|
|
1367
1554
|
name: "agent_turn",
|
|
@@ -1427,6 +1614,7 @@ ${instructions}`;
|
|
|
1427
1614
|
});
|
|
1428
1615
|
};
|
|
1429
1616
|
const readMessages = async (abortController, outputs) => {
|
|
1617
|
+
var _a2, _b;
|
|
1430
1618
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1431
1619
|
once: true
|
|
1432
1620
|
});
|
|
@@ -1473,7 +1661,9 @@ ${instructions}`;
|
|
|
1473
1661
|
(...args) => this.agent.ttsNode(...args),
|
|
1474
1662
|
ttsTextInput,
|
|
1475
1663
|
modelSettings,
|
|
1476
|
-
abortController
|
|
1664
|
+
abortController,
|
|
1665
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1666
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1477
1667
|
);
|
|
1478
1668
|
tasks.push(ttsTask);
|
|
1479
1669
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -1865,11 +2055,46 @@ ${instructions}`;
|
|
|
1865
2055
|
if (this._mainTask) {
|
|
1866
2056
|
await this._mainTask.cancelAndWait();
|
|
1867
2057
|
}
|
|
2058
|
+
if (this.interruptionDetector) {
|
|
2059
|
+
this.interruptionDetector.off(
|
|
2060
|
+
"user_overlapping_speech",
|
|
2061
|
+
this.onInterruptionOverlappingSpeech
|
|
2062
|
+
);
|
|
2063
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2064
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
2065
|
+
}
|
|
1868
2066
|
this.agent._agentActivity = void 0;
|
|
1869
2067
|
} finally {
|
|
1870
2068
|
unlock();
|
|
1871
2069
|
}
|
|
1872
2070
|
}
|
|
2071
|
+
resolveInterruptionDetector() {
|
|
2072
|
+
const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
|
|
2073
|
+
if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
|
|
2074
|
+
if (interruptionDetection === "adaptive") {
|
|
2075
|
+
this.logger.warn(
|
|
2076
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
|
|
2077
|
+
);
|
|
2078
|
+
return void 0;
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
|
|
2082
|
+
return void 0;
|
|
2083
|
+
}
|
|
2084
|
+
try {
|
|
2085
|
+
const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
|
|
2086
|
+
detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2087
|
+
detector.on("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2088
|
+
detector.on("error", this.onInterruptionError);
|
|
2089
|
+
return detector;
|
|
2090
|
+
} catch (error) {
|
|
2091
|
+
this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
|
|
2092
|
+
}
|
|
2093
|
+
return void 0;
|
|
2094
|
+
}
|
|
2095
|
+
restoreInterruptionByAudioActivity() {
|
|
2096
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2097
|
+
}
|
|
1873
2098
|
async _closeSessionResources() {
|
|
1874
2099
|
var _a, _b, _c;
|
|
1875
2100
|
if (this.llm instanceof import_llm.LLM) {
|