@livekit/agents 1.0.48 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/inference/api_protos.d.cts +71 -71
- package/dist/inference/api_protos.d.ts +71 -71
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +20 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +19 -1
- package/dist/llm/chat_context.d.ts +19 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +20 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +25 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +10 -2
- package/dist/voice/agent.d.ts +10 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +25 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +261 -36
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +20 -6
- package/dist/voice/agent_activity.d.ts +20 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +262 -37
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +5 -39
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +5 -40
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/constants.ts +13 -0
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.ts +40 -2
- package/src/llm/index.ts +1 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/voice/agent.ts +30 -3
- package/src/voice/agent_activity.ts +327 -28
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +7 -61
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -7,8 +7,11 @@ import type { Span } from '@opentelemetry/api';
|
|
|
7
7
|
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
8
8
|
import { Heap } from 'heap-js';
|
|
9
9
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
10
|
-
import { ReadableStream } from 'node:stream/web';
|
|
11
|
-
import
|
|
10
|
+
import { ReadableStream, TransformStream } from 'node:stream/web';
|
|
11
|
+
import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
|
|
12
|
+
import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
13
|
+
import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
|
|
14
|
+
import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
|
|
12
15
|
import {
|
|
13
16
|
type ChatItem,
|
|
14
17
|
type FunctionCall,
|
|
@@ -30,6 +33,7 @@ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
|
30
33
|
import { log } from '../log.js';
|
|
31
34
|
import type {
|
|
32
35
|
EOUMetrics,
|
|
36
|
+
InterruptionMetrics,
|
|
33
37
|
LLMMetrics,
|
|
34
38
|
RealtimeModelMetrics,
|
|
35
39
|
STTMetrics,
|
|
@@ -57,7 +61,6 @@ import {
|
|
|
57
61
|
type EndOfTurnInfo,
|
|
58
62
|
type PreemptiveGenerationInfo,
|
|
59
63
|
type RecognitionHooks,
|
|
60
|
-
type _TurnDetector,
|
|
61
64
|
} from './audio_recognition.js';
|
|
62
65
|
import {
|
|
63
66
|
AgentSessionEventTypes,
|
|
@@ -101,6 +104,7 @@ interface PreemptiveGeneration {
|
|
|
101
104
|
createdAt: number;
|
|
102
105
|
}
|
|
103
106
|
|
|
107
|
+
// TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
|
|
104
108
|
export class AgentActivity implements RecognitionHooks {
|
|
105
109
|
agent: Agent;
|
|
106
110
|
agentSession: AgentSession;
|
|
@@ -111,7 +115,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
111
115
|
private audioRecognition?: AudioRecognition;
|
|
112
116
|
private realtimeSession?: RealtimeSession;
|
|
113
117
|
private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
|
|
114
|
-
private turnDetectionMode?:
|
|
118
|
+
private turnDetectionMode?: TurnDetectionMode;
|
|
115
119
|
private logger = log();
|
|
116
120
|
private _schedulingPaused = true;
|
|
117
121
|
private _drainBlockedTasks: Task<any>[] = [];
|
|
@@ -126,6 +130,43 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
126
130
|
// default to null as None, which maps to the default provider tool choice value
|
|
127
131
|
private toolChoice: ToolChoice | null = null;
|
|
128
132
|
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
133
|
+
private interruptionDetector?: AdaptiveInterruptionDetector;
|
|
134
|
+
private isInterruptionDetectionEnabled: boolean;
|
|
135
|
+
private isInterruptionByAudioActivityEnabled: boolean;
|
|
136
|
+
private isDefaultInterruptionByAudioActivityEnabled: boolean;
|
|
137
|
+
|
|
138
|
+
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
|
|
139
|
+
this.onGenerationCreated(ev);
|
|
140
|
+
|
|
141
|
+
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
|
|
142
|
+
this.onInputSpeechStarted(ev);
|
|
143
|
+
|
|
144
|
+
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
|
|
145
|
+
this.onInputSpeechStopped(ev);
|
|
146
|
+
|
|
147
|
+
private readonly onRealtimeInputAudioTranscriptionCompleted = (
|
|
148
|
+
ev: InputTranscriptionCompleted,
|
|
149
|
+
): void => this.onInputAudioTranscriptionCompleted(ev);
|
|
150
|
+
|
|
151
|
+
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
|
|
152
|
+
this.onError(ev);
|
|
153
|
+
|
|
154
|
+
private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
|
|
155
|
+
this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
|
|
159
|
+
this.agentSession.emit(
|
|
160
|
+
AgentSessionEventTypes.MetricsCollected,
|
|
161
|
+
createMetricsCollectedEvent({ metrics: ev }),
|
|
162
|
+
);
|
|
163
|
+
};
|
|
164
|
+
|
|
165
|
+
private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
|
|
166
|
+
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
167
|
+
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
168
|
+
this.agentSession._onError(ev);
|
|
169
|
+
};
|
|
129
170
|
|
|
130
171
|
/** @internal */
|
|
131
172
|
_mainTask?: Task<void>;
|
|
@@ -133,16 +174,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
133
174
|
_onExitTask?: Task<void>;
|
|
134
175
|
_userTurnCompletedTask?: Task<void>;
|
|
135
176
|
|
|
136
|
-
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
|
|
137
|
-
this.onGenerationCreated(ev);
|
|
138
|
-
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
|
|
139
|
-
this.onInputSpeechStarted(ev);
|
|
140
|
-
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
|
|
141
|
-
this.onInputSpeechStopped(ev);
|
|
142
|
-
private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
|
|
143
|
-
this.onInputAudioTranscriptionCompleted(ev);
|
|
144
|
-
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
|
|
145
|
-
this.onError(ev);
|
|
146
177
|
constructor(agent: Agent, agentSession: AgentSession) {
|
|
147
178
|
this.agent = agent;
|
|
148
179
|
this.agentSession = agentSession;
|
|
@@ -235,6 +266,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
235
266
|
'for more responsive interruption handling.',
|
|
236
267
|
);
|
|
237
268
|
}
|
|
269
|
+
|
|
270
|
+
this.interruptionDetector = this.resolveInterruptionDetector();
|
|
271
|
+
this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
|
|
272
|
+
|
|
273
|
+
// this allows taking over audio interruption temporarily until interruption is detected
|
|
274
|
+
// by default it is true unless turnDetection is manual or realtime_llm
|
|
275
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
276
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
277
|
+
|
|
278
|
+
this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
|
|
238
279
|
}
|
|
239
280
|
|
|
240
281
|
async start(): Promise<void> {
|
|
@@ -348,8 +389,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
348
389
|
vad: this.vad,
|
|
349
390
|
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
350
391
|
turnDetectionMode: this.turnDetectionMode,
|
|
351
|
-
|
|
352
|
-
|
|
392
|
+
interruptionDetection: this.interruptionDetector,
|
|
393
|
+
minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
|
|
394
|
+
maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
|
|
353
395
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
354
396
|
sttModel: this.stt?.label,
|
|
355
397
|
sttProvider: this.getSttProvider(),
|
|
@@ -423,7 +465,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
423
465
|
|
|
424
466
|
get allowInterruptions(): boolean {
|
|
425
467
|
// TODO(AJS-51): Allow options to be defined in Agent class
|
|
426
|
-
return this.agentSession.options.
|
|
468
|
+
return this.agentSession.options.turnHandling.interruption?.mode !== false;
|
|
427
469
|
}
|
|
428
470
|
|
|
429
471
|
get useTtsAlignedTranscript(): boolean {
|
|
@@ -440,6 +482,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
440
482
|
return this.agent.toolCtx;
|
|
441
483
|
}
|
|
442
484
|
|
|
485
|
+
/** @internal */
|
|
486
|
+
get inputStartedAt() {
|
|
487
|
+
return this.audioRecognition?.inputStartedAt;
|
|
488
|
+
}
|
|
489
|
+
|
|
443
490
|
async updateChatCtx(chatCtx: ChatContext): Promise<void> {
|
|
444
491
|
chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
|
|
445
492
|
|
|
@@ -471,7 +518,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
471
518
|
}
|
|
472
519
|
}
|
|
473
520
|
|
|
474
|
-
updateOptions({
|
|
521
|
+
updateOptions({
|
|
522
|
+
toolChoice,
|
|
523
|
+
turnDetection,
|
|
524
|
+
}: {
|
|
525
|
+
toolChoice?: ToolChoice | null;
|
|
526
|
+
turnDetection?: TurnDetectionMode;
|
|
527
|
+
}): void {
|
|
475
528
|
if (toolChoice !== undefined) {
|
|
476
529
|
this.toolChoice = toolChoice;
|
|
477
530
|
}
|
|
@@ -479,14 +532,46 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
479
532
|
if (this.realtimeSession) {
|
|
480
533
|
this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
|
|
481
534
|
}
|
|
535
|
+
|
|
536
|
+
if (turnDetection !== undefined) {
|
|
537
|
+
this.turnDetectionMode = turnDetection;
|
|
538
|
+
this.isDefaultInterruptionByAudioActivityEnabled =
|
|
539
|
+
this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
|
|
540
|
+
|
|
541
|
+
// sync live flag immediately when not speaking so the change takes effect right away
|
|
542
|
+
if (this.agentSession.agentState !== 'speaking') {
|
|
543
|
+
this.isInterruptionByAudioActivityEnabled =
|
|
544
|
+
this.isDefaultInterruptionByAudioActivityEnabled;
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
if (this.audioRecognition) {
|
|
549
|
+
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
|
|
550
|
+
}
|
|
482
551
|
}
|
|
483
552
|
|
|
484
553
|
attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
|
|
485
554
|
void this.audioStream.close();
|
|
486
555
|
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
487
556
|
|
|
557
|
+
// Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
|
|
558
|
+
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
|
|
559
|
+
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
|
|
560
|
+
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
|
|
561
|
+
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
|
|
562
|
+
transform: (frame, controller) => {
|
|
563
|
+
const shouldDiscardForAecWarmup =
|
|
564
|
+
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
|
|
565
|
+
if (!shouldDiscardForAecWarmup) {
|
|
566
|
+
controller.enqueue(frame);
|
|
567
|
+
}
|
|
568
|
+
},
|
|
569
|
+
});
|
|
570
|
+
|
|
488
571
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
489
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
572
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
573
|
+
.pipeThrough(aecWarmupAudioFilter)
|
|
574
|
+
.tee();
|
|
490
575
|
|
|
491
576
|
if (this.realtimeSession) {
|
|
492
577
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -639,6 +724,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
639
724
|
|
|
640
725
|
if (!this.vad) {
|
|
641
726
|
this.agentSession._updateUserState('speaking');
|
|
727
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
728
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
729
|
+
0,
|
|
730
|
+
Date.now(),
|
|
731
|
+
this.agentSession._userSpeakingSpan,
|
|
732
|
+
);
|
|
733
|
+
}
|
|
642
734
|
}
|
|
643
735
|
|
|
644
736
|
// this.interrupt() is going to raise when allow_interruptions is False,
|
|
@@ -657,6 +749,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
657
749
|
this.logger.info(ev, 'onInputSpeechStopped');
|
|
658
750
|
|
|
659
751
|
if (!this.vad) {
|
|
752
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
753
|
+
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
|
|
754
|
+
}
|
|
660
755
|
this.agentSession._updateUserState('listening');
|
|
661
756
|
}
|
|
662
757
|
|
|
@@ -730,15 +825,32 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
730
825
|
onStartOfSpeech(ev: VADEvent): void {
|
|
731
826
|
let speechStartTime = Date.now();
|
|
732
827
|
if (ev) {
|
|
733
|
-
|
|
828
|
+
// Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
|
|
829
|
+
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
734
830
|
}
|
|
735
831
|
this.agentSession._updateUserState('speaking', speechStartTime);
|
|
832
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
833
|
+
// Pass speechStartTime as the absolute startedAt timestamp.
|
|
834
|
+
this.audioRecognition.onStartOfOverlapSpeech(
|
|
835
|
+
ev.speechDuration,
|
|
836
|
+
speechStartTime,
|
|
837
|
+
this.agentSession._userSpeakingSpan,
|
|
838
|
+
);
|
|
839
|
+
}
|
|
736
840
|
}
|
|
737
841
|
|
|
738
842
|
/**
 * Recognition hook invoked when the VAD reports that the user stopped speaking.
 *
 * Computes the end-of-speech timestamp, reports the end of overlapping speech
 * to the audio recognition when interruption detection is enabled, and then
 * moves the user state back to `listening`.
 *
 * @param ev - VAD event whose `silenceDuration` and `inferenceDuration` are
 *   subtracted from the current time.
 */
onEndOfSpeech(ev: VADEvent): void {
  const now = Date.now();
  // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
  const speechEndTime = ev ? now - ev.silenceDuration - ev.inferenceDuration : now;

  if (this.isInterruptionDetectionEnabled) {
    const recognition = this.audioRecognition;
    if (recognition) {
      // speechEndTime is the absolute endedAt timestamp.
      recognition.onEndOfOverlapSpeech(speechEndTime, this.agentSession._userSpeakingSpan);
    }
  }

  this.agentSession._updateUserState('listening', speechEndTime);
}
|
|
@@ -749,12 +861,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
749
861
|
return;
|
|
750
862
|
}
|
|
751
863
|
|
|
752
|
-
if (ev.speechDuration >= this.agentSession.options.
|
|
864
|
+
if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) {
|
|
753
865
|
this.interruptByAudioActivity();
|
|
754
866
|
}
|
|
755
867
|
}
|
|
756
868
|
|
|
757
869
|
private interruptByAudioActivity(): void {
|
|
870
|
+
if (!this.isInterruptionByAudioActivityEnabled) {
|
|
871
|
+
return;
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
875
|
+
// Disable interruption from audio activity while AEC warmup is active.
|
|
876
|
+
return;
|
|
877
|
+
}
|
|
878
|
+
|
|
758
879
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
759
880
|
// skip speech handle interruption if server side turn detection is enabled
|
|
760
881
|
return;
|
|
@@ -764,7 +885,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
764
885
|
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
765
886
|
// - Apply check to all STT results: empty string, undefined, or any length
|
|
766
887
|
// - This ensures consistent behavior across all interruption scenarios
|
|
767
|
-
if (
|
|
888
|
+
if (
|
|
889
|
+
this.stt &&
|
|
890
|
+
this.agentSession.options.turnHandling.interruption?.minWords > 0 &&
|
|
891
|
+
this.audioRecognition
|
|
892
|
+
) {
|
|
768
893
|
const text = this.audioRecognition.currentTranscript;
|
|
769
894
|
// TODO(shubhra): better word splitting for multi-language
|
|
770
895
|
|
|
@@ -774,7 +899,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
774
899
|
|
|
775
900
|
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
776
901
|
// This applies to all cases: empty strings, partial speech, and full speech
|
|
777
|
-
if (wordCount < this.agentSession.options.
|
|
902
|
+
if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
|
|
778
903
|
return;
|
|
779
904
|
}
|
|
780
905
|
}
|
|
@@ -795,6 +920,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
795
920
|
}
|
|
796
921
|
}
|
|
797
922
|
|
|
923
|
+
/**
 * Recognition hook invoked for an overlapping-speech (interruption) event.
 *
 * Restores the default audio-activity interruption setting, attempts to
 * interrupt by audio activity, and then tells the audio recognition (if any)
 * that the agent speech ended.
 *
 * @param ev - Overlapping speech event; `overlapStartedAt` is used as the end
 *   timestamp, falling back to `timestamp` when it is falsy.
 */
onInterruption(ev: OverlappingSpeechEvent) {
  // Order matters: the flag must be restored before the interruption attempt,
  // because interruptByAudioActivity() returns early while it is disabled.
  this.restoreInterruptionByAudioActivity();
  this.interruptByAudioActivity();

  this.audioRecognition?.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
}
|
|
930
|
+
|
|
798
931
|
onInterimTranscript(ev: SpeechEvent): void {
|
|
799
932
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
800
933
|
// skip stt transcription if userTranscription is enabled on the realtime model
|
|
@@ -870,6 +1003,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
870
1003
|
const userMessage = ChatMessage.create({
|
|
871
1004
|
role: 'user',
|
|
872
1005
|
content: info.newTranscript,
|
|
1006
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
873
1007
|
});
|
|
874
1008
|
const chatCtx = this.agent.chatCtx.copy();
|
|
875
1009
|
const speechHandle = this.generateReply({
|
|
@@ -965,16 +1099,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
965
1099
|
this._currentSpeech &&
|
|
966
1100
|
this._currentSpeech.allowInterruptions &&
|
|
967
1101
|
!this._currentSpeech.interrupted &&
|
|
968
|
-
this.agentSession.options.
|
|
1102
|
+
this.agentSession.options.turnHandling.interruption?.minWords > 0
|
|
969
1103
|
) {
|
|
970
1104
|
const wordCount = splitWords(info.newTranscript, true).length;
|
|
971
|
-
if (wordCount < this.agentSession.options.
|
|
1105
|
+
if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
|
|
972
1106
|
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
973
1107
|
this.cancelPreemptiveGeneration();
|
|
974
1108
|
this.logger.info(
|
|
975
1109
|
{
|
|
976
1110
|
wordCount,
|
|
977
|
-
minInterruptionWords: this.agentSession.options.
|
|
1111
|
+
minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords,
|
|
978
1112
|
},
|
|
979
1113
|
'skipping user input, word count below minimum interruption threshold',
|
|
980
1114
|
);
|
|
@@ -1272,6 +1406,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1272
1406
|
let userMessage: ChatMessage | undefined = ChatMessage.create({
|
|
1273
1407
|
role: 'user',
|
|
1274
1408
|
content: info.newTranscript,
|
|
1409
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
1275
1410
|
});
|
|
1276
1411
|
|
|
1277
1412
|
// create a temporary mutable chat context to pass to onUserTurnCompleted
|
|
@@ -1298,6 +1433,24 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1298
1433
|
return;
|
|
1299
1434
|
}
|
|
1300
1435
|
|
|
1436
|
+
const userMetricsReport: MetricsReport = {};
|
|
1437
|
+
if (info.startedSpeakingAt !== undefined) {
|
|
1438
|
+
userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
|
|
1439
|
+
}
|
|
1440
|
+
if (info.stoppedSpeakingAt !== undefined) {
|
|
1441
|
+
userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
|
|
1442
|
+
}
|
|
1443
|
+
if (info.transcriptionDelay !== undefined) {
|
|
1444
|
+
userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
|
|
1445
|
+
}
|
|
1446
|
+
if (info.endOfUtteranceDelay !== undefined) {
|
|
1447
|
+
userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
|
|
1448
|
+
}
|
|
1449
|
+
userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
|
|
1450
|
+
if (userMessage) {
|
|
1451
|
+
userMessage.metrics = userMetricsReport;
|
|
1452
|
+
}
|
|
1453
|
+
|
|
1301
1454
|
let speechHandle: SpeechHandle | undefined;
|
|
1302
1455
|
if (this._preemptiveGeneration !== undefined) {
|
|
1303
1456
|
const preemptive = this._preemptiveGeneration;
|
|
@@ -1310,6 +1463,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1310
1463
|
isSameToolChoice(preemptive.toolChoice, this.toolChoice)
|
|
1311
1464
|
) {
|
|
1312
1465
|
speechHandle = preemptive.speechHandle;
|
|
1466
|
+
// The preemptive userMessage was created without metrics.
|
|
1467
|
+
// Copy the metrics and transcriptConfidence from the new userMessage
|
|
1468
|
+
// to the preemptive message BEFORE scheduling (so the pipeline inserts
|
|
1469
|
+
// the message with metrics already set).
|
|
1470
|
+
if (preemptive.userMessage && userMessage) {
|
|
1471
|
+
preemptive.userMessage.metrics = userMetricsReport;
|
|
1472
|
+
preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
|
|
1473
|
+
}
|
|
1313
1474
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1314
1475
|
this.logger.debug(
|
|
1315
1476
|
{
|
|
@@ -1403,11 +1564,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1403
1564
|
tasks.push(textForwardTask);
|
|
1404
1565
|
}
|
|
1405
1566
|
|
|
1567
|
+
let replyStartedSpeakingAt: number | undefined;
|
|
1568
|
+
let replyTtsGenData: _TTSGenerationData | null = null;
|
|
1569
|
+
|
|
1406
1570
|
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1571
|
+
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1407
1572
|
this.agentSession._updateAgentState('speaking', {
|
|
1408
1573
|
startTime: startedSpeakingAt,
|
|
1409
1574
|
otelContext: speechHandle._agentTurnContext,
|
|
1410
1575
|
});
|
|
1576
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1577
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1578
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1579
|
+
}
|
|
1411
1580
|
};
|
|
1412
1581
|
|
|
1413
1582
|
if (!audioOutput) {
|
|
@@ -1425,8 +1594,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1425
1594
|
audioSource,
|
|
1426
1595
|
modelSettings,
|
|
1427
1596
|
replyAbortController,
|
|
1597
|
+
this.tts?.model,
|
|
1598
|
+
this.tts?.provider,
|
|
1428
1599
|
);
|
|
1429
1600
|
tasks.push(ttsTask);
|
|
1601
|
+
replyTtsGenData = ttsGenData;
|
|
1430
1602
|
|
|
1431
1603
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1432
1604
|
ttsGenData.audioStream,
|
|
@@ -1466,10 +1638,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1466
1638
|
}
|
|
1467
1639
|
|
|
1468
1640
|
if (addToChatCtx) {
|
|
1641
|
+
const replyStoppedSpeakingAt = Date.now();
|
|
1642
|
+
const replyAssistantMetrics: MetricsReport = {};
|
|
1643
|
+
if (replyTtsGenData?.ttfb !== undefined) {
|
|
1644
|
+
replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
|
|
1645
|
+
}
|
|
1646
|
+
if (replyStartedSpeakingAt !== undefined) {
|
|
1647
|
+
replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
|
|
1648
|
+
replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1649
|
+
}
|
|
1650
|
+
|
|
1469
1651
|
const message = ChatMessage.create({
|
|
1470
1652
|
role: 'assistant',
|
|
1471
1653
|
content: textOut?.text || '',
|
|
1472
1654
|
interrupted: speechHandle.interrupted,
|
|
1655
|
+
metrics: replyAssistantMetrics,
|
|
1473
1656
|
});
|
|
1474
1657
|
this.agent._chatCtx.insert(message);
|
|
1475
1658
|
this.agentSession._conversationItemAdded(message);
|
|
@@ -1477,6 +1660,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1477
1660
|
|
|
1478
1661
|
if (this.agentSession.agentState === 'speaking') {
|
|
1479
1662
|
this.agentSession._updateAgentState('listening');
|
|
1663
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1664
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1665
|
+
}
|
|
1666
|
+
this.restoreInterruptionByAudioActivity();
|
|
1480
1667
|
}
|
|
1481
1668
|
}
|
|
1482
1669
|
|
|
@@ -1490,6 +1677,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1490
1677
|
newMessage,
|
|
1491
1678
|
toolsMessages,
|
|
1492
1679
|
span,
|
|
1680
|
+
_previousUserMetrics,
|
|
1493
1681
|
}: {
|
|
1494
1682
|
speechHandle: SpeechHandle;
|
|
1495
1683
|
chatCtx: ChatContext;
|
|
@@ -1500,6 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1500
1688
|
newMessage?: ChatMessage;
|
|
1501
1689
|
toolsMessages?: ChatItem[];
|
|
1502
1690
|
span: Span;
|
|
1691
|
+
_previousUserMetrics?: MetricsReport;
|
|
1503
1692
|
}): Promise<void> => {
|
|
1504
1693
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1505
1694
|
|
|
@@ -1552,6 +1741,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1552
1741
|
toolCtx,
|
|
1553
1742
|
modelSettings,
|
|
1554
1743
|
replyAbortController,
|
|
1744
|
+
this.llm?.model,
|
|
1745
|
+
this.llm?.provider,
|
|
1555
1746
|
);
|
|
1556
1747
|
tasks.push(llmTask);
|
|
1557
1748
|
|
|
@@ -1568,6 +1759,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1568
1759
|
ttsTextInput,
|
|
1569
1760
|
modelSettings,
|
|
1570
1761
|
replyAbortController,
|
|
1762
|
+
this.tts?.model,
|
|
1763
|
+
this.tts?.provider,
|
|
1571
1764
|
);
|
|
1572
1765
|
tasks.push(ttsTask);
|
|
1573
1766
|
} else {
|
|
@@ -1577,10 +1770,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1577
1770
|
|
|
1578
1771
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1579
1772
|
|
|
1773
|
+
let userMetrics: MetricsReport | undefined = _previousUserMetrics;
|
|
1580
1774
|
// Add new message to actual chat context if the speech is scheduled
|
|
1581
1775
|
if (newMessage && speechHandle.scheduled) {
|
|
1582
1776
|
this.agent._chatCtx.insert(newMessage);
|
|
1583
1777
|
this.agentSession._conversationItemAdded(newMessage);
|
|
1778
|
+
userMetrics = newMessage.metrics;
|
|
1584
1779
|
}
|
|
1585
1780
|
|
|
1586
1781
|
if (speechHandle.interrupted) {
|
|
@@ -1626,11 +1821,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1626
1821
|
textOut = _textOut;
|
|
1627
1822
|
}
|
|
1628
1823
|
|
|
1824
|
+
let agentStartedSpeakingAt: number | undefined;
|
|
1629
1825
|
const onFirstFrame = (startedSpeakingAt?: number) => {
|
|
1826
|
+
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
|
|
1630
1827
|
this.agentSession._updateAgentState('speaking', {
|
|
1631
1828
|
startTime: startedSpeakingAt,
|
|
1632
1829
|
otelContext: speechHandle._agentTurnContext,
|
|
1633
1830
|
});
|
|
1831
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1832
|
+
this.audioRecognition.onStartOfAgentSpeech();
|
|
1833
|
+
this.isInterruptionByAudioActivityEnabled = false;
|
|
1834
|
+
}
|
|
1634
1835
|
};
|
|
1635
1836
|
|
|
1636
1837
|
let audioOut: _AudioOut | null = null;
|
|
@@ -1687,6 +1888,29 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1687
1888
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1688
1889
|
}
|
|
1689
1890
|
|
|
1891
|
+
const agentStoppedSpeakingAt = Date.now();
|
|
1892
|
+
const assistantMetrics: MetricsReport = {};
|
|
1893
|
+
|
|
1894
|
+
if (llmGenData.ttft !== undefined) {
|
|
1895
|
+
assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
|
|
1896
|
+
}
|
|
1897
|
+
if (ttsGenData?.ttfb !== undefined) {
|
|
1898
|
+
assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
|
|
1899
|
+
}
|
|
1900
|
+
if (agentStartedSpeakingAt !== undefined) {
|
|
1901
|
+
assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
|
|
1902
|
+
assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
|
|
1903
|
+
|
|
1904
|
+
if (userMetrics?.stoppedSpeakingAt !== undefined) {
|
|
1905
|
+
const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
|
|
1906
|
+
assistantMetrics.e2eLatency = e2eLatency;
|
|
1907
|
+
span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
|
|
1908
|
+
}
|
|
1909
|
+
}
|
|
1910
|
+
|
|
1911
|
+
span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
|
|
1912
|
+
let hasSpeechMessage = false;
|
|
1913
|
+
|
|
1690
1914
|
// add the tools messages that triggers this reply to the chat context
|
|
1691
1915
|
if (toolsMessages) {
|
|
1692
1916
|
for (const msg of toolsMessages) {
|
|
@@ -1741,45 +1965,54 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1741
1965
|
}
|
|
1742
1966
|
|
|
1743
1967
|
if (forwardedText) {
|
|
1968
|
+
hasSpeechMessage = true;
|
|
1744
1969
|
const message = ChatMessage.create({
|
|
1745
1970
|
role: 'assistant',
|
|
1746
1971
|
content: forwardedText,
|
|
1747
1972
|
id: llmGenData.id,
|
|
1748
1973
|
interrupted: true,
|
|
1749
1974
|
createdAt: replyStartedAt,
|
|
1975
|
+
metrics: assistantMetrics,
|
|
1750
1976
|
});
|
|
1751
1977
|
chatCtx.insert(message);
|
|
1752
1978
|
this.agent._chatCtx.insert(message);
|
|
1753
1979
|
speechHandle._itemAdded([message]);
|
|
1754
1980
|
this.agentSession._conversationItemAdded(message);
|
|
1981
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
|
|
1755
1982
|
}
|
|
1756
1983
|
|
|
1757
1984
|
if (this.agentSession.agentState === 'speaking') {
|
|
1758
1985
|
this.agentSession._updateAgentState('listening');
|
|
1986
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
1987
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
1988
|
+
this.restoreInterruptionByAudioActivity();
|
|
1989
|
+
}
|
|
1759
1990
|
}
|
|
1760
1991
|
|
|
1761
1992
|
this.logger.info(
|
|
1762
1993
|
{ speech_id: speechHandle.id, message: forwardedText },
|
|
1763
1994
|
'playout completed with interrupt',
|
|
1764
1995
|
);
|
|
1765
|
-
// TODO(shubhra) add chat message to speech handle
|
|
1766
1996
|
speechHandle._markGenerationDone();
|
|
1767
1997
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1768
1998
|
return;
|
|
1769
1999
|
}
|
|
1770
2000
|
|
|
1771
2001
|
if (textOut && textOut.text) {
|
|
2002
|
+
hasSpeechMessage = true;
|
|
1772
2003
|
const message = ChatMessage.create({
|
|
1773
2004
|
role: 'assistant',
|
|
1774
2005
|
id: llmGenData.id,
|
|
1775
2006
|
interrupted: false,
|
|
1776
2007
|
createdAt: replyStartedAt,
|
|
1777
2008
|
content: textOut.text,
|
|
2009
|
+
metrics: assistantMetrics,
|
|
1778
2010
|
});
|
|
1779
2011
|
chatCtx.insert(message);
|
|
1780
2012
|
this.agent._chatCtx.insert(message);
|
|
1781
2013
|
speechHandle._itemAdded([message]);
|
|
1782
2014
|
this.agentSession._conversationItemAdded(message);
|
|
2015
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
|
|
1783
2016
|
this.logger.info(
|
|
1784
2017
|
{ speech_id: speechHandle.id, message: textOut.text },
|
|
1785
2018
|
'playout completed without interruption',
|
|
@@ -1790,6 +2023,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1790
2023
|
this.agentSession._updateAgentState('thinking');
|
|
1791
2024
|
} else if (this.agentSession.agentState === 'speaking') {
|
|
1792
2025
|
this.agentSession._updateAgentState('listening');
|
|
2026
|
+
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
2027
|
+
{
|
|
2028
|
+
this.audioRecognition.onEndOfAgentSpeech(Date.now());
|
|
2029
|
+
this.restoreInterruptionByAudioActivity();
|
|
2030
|
+
}
|
|
2031
|
+
}
|
|
1793
2032
|
}
|
|
1794
2033
|
|
|
1795
2034
|
// mark the playout done before waiting for the tool execution
|
|
@@ -1849,6 +2088,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1849
2088
|
instructions,
|
|
1850
2089
|
undefined,
|
|
1851
2090
|
toolMessages,
|
|
2091
|
+
hasSpeechMessage ? undefined : userMetrics,
|
|
1852
2092
|
),
|
|
1853
2093
|
ownedSpeechHandle: speechHandle,
|
|
1854
2094
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -1882,6 +2122,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1882
2122
|
instructions?: string,
|
|
1883
2123
|
newMessage?: ChatMessage,
|
|
1884
2124
|
toolsMessages?: ChatItem[],
|
|
2125
|
+
_previousUserMetrics?: MetricsReport,
|
|
1885
2126
|
): Promise<void> =>
|
|
1886
2127
|
tracer.startActiveSpan(
|
|
1887
2128
|
async (span) =>
|
|
@@ -1895,6 +2136,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1895
2136
|
newMessage,
|
|
1896
2137
|
toolsMessages,
|
|
1897
2138
|
span,
|
|
2139
|
+
_previousUserMetrics,
|
|
1898
2140
|
}),
|
|
1899
2141
|
{
|
|
1900
2142
|
name: 'agent_turn',
|
|
@@ -2045,6 +2287,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2045
2287
|
ttsTextInput,
|
|
2046
2288
|
modelSettings,
|
|
2047
2289
|
abortController,
|
|
2290
|
+
this.tts?.model,
|
|
2291
|
+
this.tts?.provider,
|
|
2048
2292
|
);
|
|
2049
2293
|
tasks.push(ttsTask);
|
|
2050
2294
|
realtimeAudioResult = ttsGenData.audioStream;
|
|
@@ -2554,6 +2798,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2554
2798
|
if (this._mainTask) {
|
|
2555
2799
|
await this._mainTask.cancelAndWait();
|
|
2556
2800
|
}
|
|
2801
|
+
if (this.interruptionDetector) {
|
|
2802
|
+
this.interruptionDetector.off(
|
|
2803
|
+
'user_overlapping_speech',
|
|
2804
|
+
this.onInterruptionOverlappingSpeech,
|
|
2805
|
+
);
|
|
2806
|
+
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2807
|
+
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
2808
|
+
}
|
|
2557
2809
|
|
|
2558
2810
|
this.agent._agentActivity = undefined;
|
|
2559
2811
|
} finally {
|
|
@@ -2561,6 +2813,53 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2561
2813
|
}
|
|
2562
2814
|
}
|
|
2563
2815
|
|
|
2816
|
+
/**
 * Builds the adaptive interruption detector for this activity, if one should
 * be used.
 *
 * The mode is read from the agent first and falls back to the session. A
 * detector is only created when the pipeline is compatible: an STT with
 * aligned transcripts and streaming support, a VAD, a turn detection mode
 * other than `'manual'` / `'realtime_llm'`, and an LLM that is not a
 * `RealtimeModel`.
 *
 * @returns A detector with this activity's listeners attached, or `undefined`
 *   when the configuration is incompatible, the mode is `false` or `'vad'`,
 *   or the detector could not be instantiated.
 */
private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
  const interruptionDetection =
    this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;

  const isCompatible = Boolean(
    this.stt &&
      this.stt.capabilities.alignedTranscript &&
      this.stt.capabilities.streaming &&
      this.vad &&
      this.turnDetection !== 'manual' &&
      this.turnDetection !== 'realtime_llm' &&
      !(this.llm instanceof RealtimeModel),
  );

  if (!isCompatible) {
    // Only warn when adaptive detection was requested explicitly, but never
    // build a detector for an incompatible pipeline, even when the mode was
    // left unset.
    if (interruptionDetection === 'adaptive') {
      this.logger.warn(
        "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
      );
    }
    return undefined;
  }

  // Adaptive detection was explicitly turned off, or plain VAD was requested.
  if (interruptionDetection === false || interruptionDetection === 'vad') {
    return undefined;
  }

  try {
    const detector = new AdaptiveInterruptionDetector();

    // These listeners are detached again with `off(...)` when the activity closes.
    detector.on('user_overlapping_speech', this.onInterruptionOverlappingSpeech);
    detector.on('metrics_collected', this.onInterruptionMetricsCollected);
    detector.on('error', this.onInterruptionError);

    return detector;
  } catch (error: unknown) {
    // Best effort: a failure here disables adaptive detection instead of
    // failing the whole activity.
    this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
  }
  return undefined;
}
|
|
2858
|
+
|
|
2859
|
+
/**
 * Resets the audio-activity interruption flag to its default value, undoing
 * the temporary `false` assigned while the agent is speaking with
 * interruption detection enabled.
 */
private restoreInterruptionByAudioActivity(): void {
  const defaultEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
  this.isInterruptionByAudioActivityEnabled = defaultEnabled;
}
|
|
2862
|
+
|
|
2564
2863
|
private async _closeSessionResources(): Promise<void> {
|
|
2565
2864
|
// Unregister event handlers to prevent duplicate metrics
|
|
2566
2865
|
if (this.llm instanceof LLM) {
|