@livekit/agents 1.0.48 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/inference/api_protos.d.cts +71 -71
- package/dist/inference/api_protos.d.ts +71 -71
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +20 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +19 -1
- package/dist/llm/chat_context.d.ts +19 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +20 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +25 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +10 -2
- package/dist/voice/agent.d.ts +10 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +25 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +261 -36
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +20 -6
- package/dist/voice/agent_activity.d.ts +20 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +262 -37
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +5 -39
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +5 -40
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/constants.ts +13 -0
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.ts +40 -2
- package/src/llm/index.ts +1 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/voice/agent.ts +30 -3
- package/src/voice/agent_activity.ts +327 -28
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +7 -61
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -2,7 +2,8 @@ import { Mutex } from "@livekit/mutex";
|
|
|
2
2
|
import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
|
|
3
3
|
import { Heap } from "heap-js";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
|
-
import { ReadableStream } from "node:stream/web";
|
|
5
|
+
import { ReadableStream, TransformStream } from "node:stream/web";
|
|
6
|
+
import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
|
|
6
7
|
import { ChatMessage } from "../llm/chat_context.js";
|
|
7
8
|
import {
|
|
8
9
|
LLM,
|
|
@@ -74,16 +75,34 @@ class AgentActivity {
|
|
|
74
75
|
// default to null as None, which maps to the default provider tool choice value
|
|
75
76
|
toolChoice = null;
|
|
76
77
|
_preemptiveGeneration;
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
_userTurnCompletedTask;
|
|
78
|
+
interruptionDetector;
|
|
79
|
+
isInterruptionDetectionEnabled;
|
|
80
|
+
isInterruptionByAudioActivityEnabled;
|
|
81
|
+
isDefaultInterruptionByAudioActivityEnabled;
|
|
82
82
|
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
83
83
|
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
84
84
|
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
85
85
|
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
86
86
|
onModelError = (ev) => this.onError(ev);
|
|
87
|
+
onInterruptionOverlappingSpeech = (ev) => {
|
|
88
|
+
this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
89
|
+
};
|
|
90
|
+
onInterruptionMetricsCollected = (ev) => {
|
|
91
|
+
this.agentSession.emit(
|
|
92
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
93
|
+
createMetricsCollectedEvent({ metrics: ev })
|
|
94
|
+
);
|
|
95
|
+
};
|
|
96
|
+
onInterruptionError = (ev) => {
|
|
97
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
98
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
99
|
+
this.agentSession._onError(ev);
|
|
100
|
+
};
|
|
101
|
+
/** @internal */
|
|
102
|
+
_mainTask;
|
|
103
|
+
_onEnterTask;
|
|
104
|
+
_onExitTask;
|
|
105
|
+
_userTurnCompletedTask;
|
|
87
106
|
constructor(agent, agentSession) {
|
|
88
107
|
this.agent = agent;
|
|
89
108
|
this.agentSession = agentSession;
|
|
@@ -142,6 +161,10 @@ class AgentActivity {
|
|
|
142
161
|
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
143
162
|
);
|
|
144
163
|
}
|
|
164
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
165
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
166
|
+
this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
167
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
145
168
|
}
|
|
146
169
|
async start() {
|
|
147
170
|
const unlock = await this.lock.lock();
|
|
@@ -234,8 +257,9 @@ class AgentActivity {
|
|
|
234
257
|
vad: this.vad,
|
|
235
258
|
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
236
259
|
turnDetectionMode: this.turnDetectionMode,
|
|
237
|
-
|
|
238
|
-
|
|
260
|
+
interruptionDetection: this.interruptionDetector,
|
|
261
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
262
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
239
263
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
240
264
|
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
241
265
|
sttProvider: this.getSttProvider(),
|
|
@@ -297,7 +321,8 @@ class AgentActivity {
|
|
|
297
321
|
return this.realtimeSession;
|
|
298
322
|
}
|
|
299
323
|
get allowInterruptions() {
|
|
300
|
-
|
|
324
|
+
var _a;
|
|
325
|
+
return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
|
|
301
326
|
}
|
|
302
327
|
get useTtsAlignedTranscript() {
|
|
303
328
|
return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
|
|
@@ -308,6 +333,11 @@ class AgentActivity {
|
|
|
308
333
|
get toolCtx() {
|
|
309
334
|
return this.agent.toolCtx;
|
|
310
335
|
}
|
|
336
|
+
/** @internal */
|
|
337
|
+
get inputStartedAt() {
|
|
338
|
+
var _a;
|
|
339
|
+
return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
|
|
340
|
+
}
|
|
311
341
|
async updateChatCtx(chatCtx) {
|
|
312
342
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
313
343
|
this.agent._chatCtx = chatCtx;
|
|
@@ -332,19 +362,40 @@ class AgentActivity {
|
|
|
332
362
|
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
333
363
|
}
|
|
334
364
|
}
|
|
335
|
-
updateOptions({
|
|
365
|
+
updateOptions({
|
|
366
|
+
toolChoice,
|
|
367
|
+
turnDetection
|
|
368
|
+
}) {
|
|
336
369
|
if (toolChoice !== void 0) {
|
|
337
370
|
this.toolChoice = toolChoice;
|
|
338
371
|
}
|
|
339
372
|
if (this.realtimeSession) {
|
|
340
373
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
341
374
|
}
|
|
375
|
+
if (turnDetection !== void 0) {
|
|
376
|
+
this.turnDetectionMode = turnDetection;
|
|
377
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
|
|
378
|
+
if (this.agentSession.agentState !== "speaking") {
|
|
379
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
if (this.audioRecognition) {
|
|
383
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
384
|
+
}
|
|
342
385
|
}
|
|
343
386
|
attachAudioInput(audioStream) {
|
|
344
387
|
void this.audioStream.close();
|
|
345
388
|
this.audioStream = new MultiInputStream();
|
|
389
|
+
const aecWarmupAudioFilter = new TransformStream({
|
|
390
|
+
transform: (frame, controller) => {
|
|
391
|
+
const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
|
|
392
|
+
if (!shouldDiscardForAecWarmup) {
|
|
393
|
+
controller.enqueue(frame);
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
});
|
|
346
397
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
347
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
398
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
|
|
348
399
|
if (this.realtimeSession) {
|
|
349
400
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
350
401
|
}
|
|
@@ -450,6 +501,13 @@ class AgentActivity {
|
|
|
450
501
|
this.logger.info("onInputSpeechStarted");
|
|
451
502
|
if (!this.vad) {
|
|
452
503
|
this.agentSession._updateUserState("speaking");
|
|
504
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
505
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
506
|
+
0,
|
|
507
|
+
Date.now(),
|
|
508
|
+
this.agentSession._userSpeakingSpan
|
|
509
|
+
);
|
|
510
|
+
}
|
|
453
511
|
}
|
|
454
512
|
try {
|
|
455
513
|
this.interrupt();
|
|
@@ -463,6 +521,9 @@ class AgentActivity {
|
|
|
463
521
|
onInputSpeechStopped(ev) {
|
|
464
522
|
this.logger.info(ev, "onInputSpeechStopped");
|
|
465
523
|
if (!this.vad) {
|
|
524
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
525
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
526
|
+
}
|
|
466
527
|
this.agentSession._updateUserState("listening");
|
|
467
528
|
}
|
|
468
529
|
if (ev.userTranscriptionEnabled) {
|
|
@@ -524,48 +585,75 @@ class AgentActivity {
|
|
|
524
585
|
onStartOfSpeech(ev) {
|
|
525
586
|
let speechStartTime = Date.now();
|
|
526
587
|
if (ev) {
|
|
527
|
-
speechStartTime = speechStartTime - ev.speechDuration;
|
|
588
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
528
589
|
}
|
|
529
590
|
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
591
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
592
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
593
|
+
ev.speechDuration,
|
|
594
|
+
speechStartTime,
|
|
595
|
+
this.agentSession._userSpeakingSpan
|
|
596
|
+
);
|
|
597
|
+
}
|
|
530
598
|
}
|
|
531
599
|
onEndOfSpeech(ev) {
|
|
532
600
|
let speechEndTime = Date.now();
|
|
533
601
|
if (ev) {
|
|
534
|
-
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
602
|
+
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
|
|
603
|
+
}
|
|
604
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
605
|
+
this.audioRecognition.onEndOfOverlapSpeech(
|
|
606
|
+
speechEndTime,
|
|
607
|
+
this.agentSession._userSpeakingSpan
|
|
608
|
+
);
|
|
535
609
|
}
|
|
536
610
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
537
611
|
}
|
|
538
612
|
onVADInferenceDone(ev) {
|
|
613
|
+
var _a;
|
|
539
614
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
540
615
|
return;
|
|
541
616
|
}
|
|
542
|
-
if (ev.speechDuration >= this.agentSession.options.
|
|
617
|
+
if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
|
|
543
618
|
this.interruptByAudioActivity();
|
|
544
619
|
}
|
|
545
620
|
}
|
|
546
621
|
interruptByAudioActivity() {
|
|
547
|
-
var _a, _b;
|
|
622
|
+
var _a, _b, _c, _d;
|
|
623
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
624
|
+
return;
|
|
625
|
+
}
|
|
626
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
627
|
+
return;
|
|
628
|
+
}
|
|
548
629
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
549
630
|
return;
|
|
550
631
|
}
|
|
551
|
-
if (this.stt && this.agentSession.options.
|
|
632
|
+
if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
|
|
552
633
|
const text = this.audioRecognition.currentTranscript;
|
|
553
634
|
const normalizedText = text ?? "";
|
|
554
635
|
const wordCount = splitWords(normalizedText, true).length;
|
|
555
|
-
if (wordCount < this.agentSession.options.
|
|
636
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
556
637
|
return;
|
|
557
638
|
}
|
|
558
639
|
}
|
|
559
|
-
(
|
|
640
|
+
(_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
|
|
560
641
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
561
642
|
this.logger.info(
|
|
562
643
|
{ "speech id": this._currentSpeech.id },
|
|
563
644
|
"speech interrupted by audio activity"
|
|
564
645
|
);
|
|
565
|
-
(
|
|
646
|
+
(_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
|
|
566
647
|
this._currentSpeech.interrupt();
|
|
567
648
|
}
|
|
568
649
|
}
|
|
650
|
+
onInterruption(ev) {
|
|
651
|
+
this.restoreInterruptionByAudioActivity();
|
|
652
|
+
this.interruptByAudioActivity();
|
|
653
|
+
if (this.audioRecognition) {
|
|
654
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
|
|
655
|
+
}
|
|
656
|
+
}
|
|
569
657
|
onInterimTranscript(ev) {
|
|
570
658
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
571
659
|
return;
|
|
@@ -614,7 +702,8 @@ class AgentActivity {
|
|
|
614
702
|
);
|
|
615
703
|
const userMessage = ChatMessage.create({
|
|
616
704
|
role: "user",
|
|
617
|
-
content: info.newTranscript
|
|
705
|
+
content: info.newTranscript,
|
|
706
|
+
transcriptConfidence: info.transcriptConfidence
|
|
618
707
|
});
|
|
619
708
|
const chatCtx = this.agent.chatCtx.copy();
|
|
620
709
|
const speechHandle = this.generateReply({
|
|
@@ -672,6 +761,7 @@ class AgentActivity {
|
|
|
672
761
|
return task;
|
|
673
762
|
}
|
|
674
763
|
async onEndOfTurn(info) {
|
|
764
|
+
var _a, _b;
|
|
675
765
|
if (this.schedulingPaused) {
|
|
676
766
|
this.cancelPreemptiveGeneration();
|
|
677
767
|
this.logger.warn(
|
|
@@ -680,14 +770,14 @@ class AgentActivity {
|
|
|
680
770
|
);
|
|
681
771
|
return true;
|
|
682
772
|
}
|
|
683
|
-
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.
|
|
773
|
+
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
|
|
684
774
|
const wordCount = splitWords(info.newTranscript, true).length;
|
|
685
|
-
if (wordCount < this.agentSession.options.
|
|
775
|
+
if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
|
|
686
776
|
this.cancelPreemptiveGeneration();
|
|
687
777
|
this.logger.info(
|
|
688
778
|
{
|
|
689
779
|
wordCount,
|
|
690
|
-
minInterruptionWords: this.agentSession.options.
|
|
780
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
|
|
691
781
|
},
|
|
692
782
|
"skipping user input, word count below minimum interruption threshold"
|
|
693
783
|
);
|
|
@@ -906,7 +996,8 @@ ${instructions}`;
|
|
|
906
996
|
}
|
|
907
997
|
let userMessage = ChatMessage.create({
|
|
908
998
|
role: "user",
|
|
909
|
-
content: info.newTranscript
|
|
999
|
+
content: info.newTranscript,
|
|
1000
|
+
transcriptConfidence: info.transcriptConfidence
|
|
910
1001
|
});
|
|
911
1002
|
const chatCtx = this.agent.chatCtx.copy();
|
|
912
1003
|
const startTime = Date.now();
|
|
@@ -924,11 +1015,32 @@ ${instructions}`;
|
|
|
924
1015
|
} else if (this.llm === void 0) {
|
|
925
1016
|
return;
|
|
926
1017
|
}
|
|
1018
|
+
const userMetricsReport = {};
|
|
1019
|
+
if (info.startedSpeakingAt !== void 0) {
|
|
1020
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
|
|
1021
|
+
}
|
|
1022
|
+
if (info.stoppedSpeakingAt !== void 0) {
|
|
1023
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
|
|
1024
|
+
}
|
|
1025
|
+
if (info.transcriptionDelay !== void 0) {
|
|
1026
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
|
|
1027
|
+
}
|
|
1028
|
+
if (info.endOfUtteranceDelay !== void 0) {
|
|
1029
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
|
|
1030
|
+
}
|
|
1031
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
|
|
1032
|
+
if (userMessage) {
|
|
1033
|
+
userMessage.metrics = userMetricsReport;
|
|
1034
|
+
}
|
|
927
1035
|
let speechHandle;
|
|
928
1036
|
if (this._preemptiveGeneration !== void 0) {
|
|
929
1037
|
const preemptive = this._preemptiveGeneration;
|
|
930
1038
|
if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
|
|
931
1039
|
speechHandle = preemptive.speechHandle;
|
|
1040
|
+
if (preemptive.userMessage && userMessage) {
|
|
1041
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1042
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1043
|
+
}
|
|
932
1044
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
933
1045
|
this.logger.debug(
|
|
934
1046
|
{
|
|
@@ -962,6 +1074,7 @@ ${instructions}`;
|
|
|
962
1074
|
);
|
|
963
1075
|
}
|
|
964
1076
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
1077
|
+
var _a, _b;
|
|
965
1078
|
speechHandle._agentTurnContext = otelContext.active();
|
|
966
1079
|
speechHandleStorage.enterWith(speechHandle);
|
|
967
1080
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
@@ -994,11 +1107,18 @@ ${instructions}`;
|
|
|
994
1107
|
textOut = _textOut;
|
|
995
1108
|
tasks.push(textForwardTask);
|
|
996
1109
|
}
|
|
1110
|
+
let replyStartedSpeakingAt;
|
|
1111
|
+
let replyTtsGenData = null;
|
|
997
1112
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1113
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
998
1114
|
this.agentSession._updateAgentState("speaking", {
|
|
999
1115
|
startTime: startedSpeakingAt,
|
|
1000
1116
|
otelContext: speechHandle._agentTurnContext
|
|
1001
1117
|
});
|
|
1118
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1119
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1120
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1121
|
+
}
|
|
1002
1122
|
};
|
|
1003
1123
|
if (!audioOutput) {
|
|
1004
1124
|
if (textOut) {
|
|
@@ -1011,9 +1131,12 @@ ${instructions}`;
|
|
|
1011
1131
|
(...args) => this.agent.ttsNode(...args),
|
|
1012
1132
|
audioSource,
|
|
1013
1133
|
modelSettings,
|
|
1014
|
-
replyAbortController
|
|
1134
|
+
replyAbortController,
|
|
1135
|
+
(_a = this.tts) == null ? void 0 : _a.model,
|
|
1136
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1015
1137
|
);
|
|
1016
1138
|
tasks.push(ttsTask);
|
|
1139
|
+
replyTtsGenData = ttsGenData;
|
|
1017
1140
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1018
1141
|
ttsGenData.audioStream,
|
|
1019
1142
|
audioOutput,
|
|
@@ -1045,16 +1168,30 @@ ${instructions}`;
|
|
|
1045
1168
|
}
|
|
1046
1169
|
}
|
|
1047
1170
|
if (addToChatCtx) {
|
|
1171
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1172
|
+
const replyAssistantMetrics = {};
|
|
1173
|
+
if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
|
|
1174
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1175
|
+
}
|
|
1176
|
+
if (replyStartedSpeakingAt !== void 0) {
|
|
1177
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
|
|
1178
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
|
|
1179
|
+
}
|
|
1048
1180
|
const message = ChatMessage.create({
|
|
1049
1181
|
role: "assistant",
|
|
1050
1182
|
content: (textOut == null ? void 0 : textOut.text) || "",
|
|
1051
|
-
interrupted: speechHandle.interrupted
|
|
1183
|
+
interrupted: speechHandle.interrupted,
|
|
1184
|
+
metrics: replyAssistantMetrics
|
|
1052
1185
|
});
|
|
1053
1186
|
this.agent._chatCtx.insert(message);
|
|
1054
1187
|
this.agentSession._conversationItemAdded(message);
|
|
1055
1188
|
}
|
|
1056
1189
|
if (this.agentSession.agentState === "speaking") {
|
|
1057
1190
|
this.agentSession._updateAgentState("listening");
|
|
1191
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1192
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1193
|
+
}
|
|
1194
|
+
this.restoreInterruptionByAudioActivity();
|
|
1058
1195
|
}
|
|
1059
1196
|
}
|
|
1060
1197
|
_pipelineReplyTaskImpl = async ({
|
|
@@ -1066,9 +1203,10 @@ ${instructions}`;
|
|
|
1066
1203
|
instructions,
|
|
1067
1204
|
newMessage,
|
|
1068
1205
|
toolsMessages,
|
|
1069
|
-
span
|
|
1206
|
+
span,
|
|
1207
|
+
_previousUserMetrics
|
|
1070
1208
|
}) => {
|
|
1071
|
-
var _a, _b;
|
|
1209
|
+
var _a, _b, _c, _d, _e, _f;
|
|
1072
1210
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1073
1211
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1074
1212
|
if (instructions) {
|
|
@@ -1106,7 +1244,9 @@ ${instructions}`;
|
|
|
1106
1244
|
chatCtx,
|
|
1107
1245
|
toolCtx,
|
|
1108
1246
|
modelSettings,
|
|
1109
|
-
replyAbortController
|
|
1247
|
+
replyAbortController,
|
|
1248
|
+
(_b = this.llm) == null ? void 0 : _b.model,
|
|
1249
|
+
(_c = this.llm) == null ? void 0 : _c.provider
|
|
1110
1250
|
);
|
|
1111
1251
|
tasks.push(llmTask);
|
|
1112
1252
|
let ttsTask = null;
|
|
@@ -1119,16 +1259,20 @@ ${instructions}`;
|
|
|
1119
1259
|
(...args) => this.agent.ttsNode(...args),
|
|
1120
1260
|
ttsTextInput,
|
|
1121
1261
|
modelSettings,
|
|
1122
|
-
replyAbortController
|
|
1262
|
+
replyAbortController,
|
|
1263
|
+
(_d = this.tts) == null ? void 0 : _d.model,
|
|
1264
|
+
(_e = this.tts) == null ? void 0 : _e.provider
|
|
1123
1265
|
);
|
|
1124
1266
|
tasks.push(ttsTask);
|
|
1125
1267
|
} else {
|
|
1126
1268
|
llmOutput = llmGenData.textStream;
|
|
1127
1269
|
}
|
|
1128
1270
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1271
|
+
let userMetrics = _previousUserMetrics;
|
|
1129
1272
|
if (newMessage && speechHandle.scheduled) {
|
|
1130
1273
|
this.agent._chatCtx.insert(newMessage);
|
|
1131
1274
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1275
|
+
userMetrics = newMessage.metrics;
|
|
1132
1276
|
}
|
|
1133
1277
|
if (speechHandle.interrupted) {
|
|
1134
1278
|
replyAbortController.abort();
|
|
@@ -1140,7 +1284,7 @@ ${instructions}`;
|
|
|
1140
1284
|
speechHandle._clearAuthorization();
|
|
1141
1285
|
const replyStartedAt = Date.now();
|
|
1142
1286
|
let transcriptionInput = llmOutput;
|
|
1143
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1287
|
+
if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
|
|
1144
1288
|
const timedTextsStream = await Promise.race([
|
|
1145
1289
|
ttsGenData.timedTextsFut.await,
|
|
1146
1290
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1163,11 +1307,17 @@ ${instructions}`;
|
|
|
1163
1307
|
tasks.push(textForwardTask);
|
|
1164
1308
|
textOut = _textOut;
|
|
1165
1309
|
}
|
|
1310
|
+
let agentStartedSpeakingAt;
|
|
1166
1311
|
const onFirstFrame = (startedSpeakingAt) => {
|
|
1312
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1167
1313
|
this.agentSession._updateAgentState("speaking", {
|
|
1168
1314
|
startTime: startedSpeakingAt,
|
|
1169
1315
|
otelContext: speechHandle._agentTurnContext
|
|
1170
1316
|
});
|
|
1317
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1318
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1319
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1320
|
+
}
|
|
1171
1321
|
};
|
|
1172
1322
|
let audioOut = null;
|
|
1173
1323
|
if (audioOutput) {
|
|
@@ -1210,6 +1360,25 @@ ${instructions}`;
|
|
|
1210
1360
|
if (audioOutput) {
|
|
1211
1361
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1212
1362
|
}
|
|
1363
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1364
|
+
const assistantMetrics = {};
|
|
1365
|
+
if (llmGenData.ttft !== void 0) {
|
|
1366
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft;
|
|
1367
|
+
}
|
|
1368
|
+
if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
|
|
1369
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
|
|
1370
|
+
}
|
|
1371
|
+
if (agentStartedSpeakingAt !== void 0) {
|
|
1372
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
|
|
1373
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
|
|
1374
|
+
if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
|
|
1375
|
+
const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
|
|
1376
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1377
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1378
|
+
}
|
|
1379
|
+
}
|
|
1380
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1381
|
+
let hasSpeechMessage = false;
|
|
1213
1382
|
if (toolsMessages) {
|
|
1214
1383
|
for (const msg of toolsMessages) {
|
|
1215
1384
|
msg.createdAt = replyStartedAt;
|
|
@@ -1250,20 +1419,27 @@ ${instructions}`;
|
|
|
1250
1419
|
}
|
|
1251
1420
|
}
|
|
1252
1421
|
if (forwardedText) {
|
|
1422
|
+
hasSpeechMessage = true;
|
|
1253
1423
|
const message = ChatMessage.create({
|
|
1254
1424
|
role: "assistant",
|
|
1255
1425
|
content: forwardedText,
|
|
1256
1426
|
id: llmGenData.id,
|
|
1257
1427
|
interrupted: true,
|
|
1258
|
-
createdAt: replyStartedAt
|
|
1428
|
+
createdAt: replyStartedAt,
|
|
1429
|
+
metrics: assistantMetrics
|
|
1259
1430
|
});
|
|
1260
1431
|
chatCtx.insert(message);
|
|
1261
1432
|
this.agent._chatCtx.insert(message);
|
|
1262
1433
|
speechHandle._itemAdded([message]);
|
|
1263
1434
|
this.agentSession._conversationItemAdded(message);
|
|
1435
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1264
1436
|
}
|
|
1265
1437
|
if (this.agentSession.agentState === "speaking") {
|
|
1266
1438
|
this.agentSession._updateAgentState("listening");
|
|
1439
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1440
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1441
|
+
this.restoreInterruptionByAudioActivity();
|
|
1442
|
+
}
|
|
1267
1443
|
}
|
|
1268
1444
|
this.logger.info(
|
|
1269
1445
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
@@ -1274,17 +1450,20 @@ ${instructions}`;
|
|
|
1274
1450
|
return;
|
|
1275
1451
|
}
|
|
1276
1452
|
if (textOut && textOut.text) {
|
|
1453
|
+
hasSpeechMessage = true;
|
|
1277
1454
|
const message = ChatMessage.create({
|
|
1278
1455
|
role: "assistant",
|
|
1279
1456
|
id: llmGenData.id,
|
|
1280
1457
|
interrupted: false,
|
|
1281
1458
|
createdAt: replyStartedAt,
|
|
1282
|
-
content: textOut.text
|
|
1459
|
+
content: textOut.text,
|
|
1460
|
+
metrics: assistantMetrics
|
|
1283
1461
|
});
|
|
1284
1462
|
chatCtx.insert(message);
|
|
1285
1463
|
this.agent._chatCtx.insert(message);
|
|
1286
1464
|
speechHandle._itemAdded([message]);
|
|
1287
1465
|
this.agentSession._conversationItemAdded(message);
|
|
1466
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1288
1467
|
this.logger.info(
|
|
1289
1468
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1290
1469
|
"playout completed without interruption"
|
|
@@ -1294,6 +1473,12 @@ ${instructions}`;
|
|
|
1294
1473
|
this.agentSession._updateAgentState("thinking");
|
|
1295
1474
|
} else if (this.agentSession.agentState === "speaking") {
|
|
1296
1475
|
this.agentSession._updateAgentState("listening");
|
|
1476
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1477
|
+
{
|
|
1478
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1479
|
+
this.restoreInterruptionByAudioActivity();
|
|
1480
|
+
}
|
|
1481
|
+
}
|
|
1297
1482
|
}
|
|
1298
1483
|
speechHandle._markGenerationDone();
|
|
1299
1484
|
await executeToolsTask.result;
|
|
@@ -1333,7 +1518,8 @@ ${instructions}`;
|
|
|
1333
1518
|
replyAbortController,
|
|
1334
1519
|
instructions,
|
|
1335
1520
|
void 0,
|
|
1336
|
-
toolMessages
|
|
1521
|
+
toolMessages,
|
|
1522
|
+
hasSpeechMessage ? void 0 : userMetrics
|
|
1337
1523
|
),
|
|
1338
1524
|
ownedSpeechHandle: speechHandle,
|
|
1339
1525
|
name: "AgentActivity.pipelineReply"
|
|
@@ -1353,7 +1539,7 @@ ${instructions}`;
|
|
|
1353
1539
|
}
|
|
1354
1540
|
}
|
|
1355
1541
|
};
|
|
1356
|
-
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
|
|
1542
|
+
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
|
|
1357
1543
|
async (span) => this._pipelineReplyTaskImpl({
|
|
1358
1544
|
speechHandle,
|
|
1359
1545
|
chatCtx,
|
|
@@ -1363,7 +1549,8 @@ ${instructions}`;
|
|
|
1363
1549
|
instructions,
|
|
1364
1550
|
newMessage,
|
|
1365
1551
|
toolsMessages,
|
|
1366
|
-
span
|
|
1552
|
+
span,
|
|
1553
|
+
_previousUserMetrics
|
|
1367
1554
|
}),
|
|
1368
1555
|
{
|
|
1369
1556
|
name: "agent_turn",
|
|
@@ -1429,6 +1616,7 @@ ${instructions}`;
|
|
|
1429
1616
|
});
|
|
1430
1617
|
};
|
|
1431
1618
|
const readMessages = async (abortController, outputs) => {
|
|
1619
|
+
var _a2, _b;
|
|
1432
1620
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
1433
1621
|
once: true
|
|
1434
1622
|
});
|
|
@@ -1475,7 +1663,9 @@ ${instructions}`;
|
|
|
1475
1663
|
(...args) => this.agent.ttsNode(...args),
|
|
1476
1664
|
ttsTextInput,
|
|
1477
1665
|
modelSettings,
|
|
1478
|
-
abortController
|
|
1666
|
+
abortController,
|
|
1667
|
+
(_a2 = this.tts) == null ? void 0 : _a2.model,
|
|
1668
|
+
(_b = this.tts) == null ? void 0 : _b.provider
|
|
1479
1669
|
);
|
|
1480
1670
|
tasks.push(ttsTask);
|
|
1481
1671
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -1867,11 +2057,46 @@ ${instructions}`;
|
|
|
1867
2057
|
if (this._mainTask) {
|
|
1868
2058
|
await this._mainTask.cancelAndWait();
|
|
1869
2059
|
}
|
|
2060
|
+
if (this.interruptionDetector) {
|
|
2061
|
+
this.interruptionDetector.off(
|
|
2062
|
+
"user_overlapping_speech",
|
|
2063
|
+
this.onInterruptionOverlappingSpeech
|
|
2064
|
+
);
|
|
2065
|
+
this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2066
|
+
this.interruptionDetector.off("error", this.onInterruptionError);
|
|
2067
|
+
}
|
|
1870
2068
|
this.agent._agentActivity = void 0;
|
|
1871
2069
|
} finally {
|
|
1872
2070
|
unlock();
|
|
1873
2071
|
}
|
|
1874
2072
|
}
|
|
2073
|
+
resolveInterruptionDetector() {
|
|
2074
|
+
const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
|
|
2075
|
+
if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
|
|
2076
|
+
if (interruptionDetection === "adaptive") {
|
|
2077
|
+
this.logger.warn(
|
|
2078
|
+
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
|
|
2079
|
+
);
|
|
2080
|
+
return void 0;
|
|
2081
|
+
}
|
|
2082
|
+
}
|
|
2083
|
+
if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
|
|
2084
|
+
return void 0;
|
|
2085
|
+
}
|
|
2086
|
+
try {
|
|
2087
|
+
const detector = new AdaptiveInterruptionDetector();
|
|
2088
|
+
detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
|
|
2089
|
+
detector.on("metrics_collected", this.onInterruptionMetricsCollected);
|
|
2090
|
+
detector.on("error", this.onInterruptionError);
|
|
2091
|
+
return detector;
|
|
2092
|
+
} catch (error) {
|
|
2093
|
+
this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
|
|
2094
|
+
}
|
|
2095
|
+
return void 0;
|
|
2096
|
+
}
|
|
2097
|
+
restoreInterruptionByAudioActivity() {
|
|
2098
|
+
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2099
|
+
}
|
|
1875
2100
|
async _closeSessionResources() {
|
|
1876
2101
|
var _a, _b, _c;
|
|
1877
2102
|
if (this.llm instanceof LLM) {
|