@livekit/agents 1.0.48 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/inference/api_protos.d.cts +71 -71
- package/dist/inference/api_protos.d.ts +71 -71
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +20 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +19 -1
- package/dist/llm/chat_context.d.ts +19 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +20 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +25 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +10 -2
- package/dist/voice/agent.d.ts +10 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +25 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +261 -36
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +20 -6
- package/dist/voice/agent_activity.d.ts +20 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +262 -37
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +5 -39
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +5 -40
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/constants.ts +13 -0
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.ts +40 -2
- package/src/llm/index.ts +1 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/voice/agent.ts +30 -3
- package/src/voice/agent_activity.ts +327 -28
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +7 -61
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
/// <reference types="node" resolution-mode="require"/>
|
|
2
2
|
import type { ParticipantKind } from '@livekit/rtc-node';
|
|
3
3
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
4
|
-
import { type Context } from '@opentelemetry/api';
|
|
4
|
+
import { type Context, type Span } from '@opentelemetry/api';
|
|
5
5
|
import { ReadableStream } from 'node:stream/web';
|
|
6
|
+
import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
7
|
+
import { type OverlappingSpeechEvent } from '../inference/interruption/types.js';
|
|
6
8
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
7
9
|
import { type SpeechEvent } from '../stt/stt.js';
|
|
8
10
|
import { type VAD, type VADEvent } from '../vad.js';
|
|
@@ -27,6 +29,7 @@ export interface PreemptiveGenerationInfo {
|
|
|
27
29
|
transcriptConfidence: number;
|
|
28
30
|
}
|
|
29
31
|
export interface RecognitionHooks {
|
|
32
|
+
onInterruption: (ev: OverlappingSpeechEvent) => void;
|
|
30
33
|
onStartOfSpeech: (ev: VADEvent) => void;
|
|
31
34
|
onVADInferenceDone: (ev: VADEvent) => void;
|
|
32
35
|
onEndOfSpeech: (ev: VADEvent) => void;
|
|
@@ -37,9 +40,13 @@ export interface RecognitionHooks {
|
|
|
37
40
|
retrieveChatCtx: () => ChatContext;
|
|
38
41
|
}
|
|
39
42
|
export interface _TurnDetector {
|
|
43
|
+
/** The model name used by this turn detector. */
|
|
44
|
+
readonly model: string;
|
|
45
|
+
/** The provider name for this turn detector. */
|
|
46
|
+
readonly provider: string;
|
|
40
47
|
unlikelyThreshold: (language?: string) => Promise<number | undefined>;
|
|
41
48
|
supportsLanguage: (language?: string) => Promise<boolean>;
|
|
42
|
-
predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
|
|
49
|
+
predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
|
|
43
50
|
}
|
|
44
51
|
export interface AudioRecognitionOptions {
|
|
45
52
|
/** Hooks for recognition events. */
|
|
@@ -51,7 +58,8 @@ export interface AudioRecognitionOptions {
|
|
|
51
58
|
/** Turn detector for end-of-turn prediction. */
|
|
52
59
|
turnDetector?: _TurnDetector;
|
|
53
60
|
/** Turn detection mode. */
|
|
54
|
-
turnDetectionMode?:
|
|
61
|
+
turnDetectionMode?: TurnDetectionMode;
|
|
62
|
+
interruptionDetection?: AdaptiveInterruptionDetector;
|
|
55
63
|
/** Minimum endpointing delay in milliseconds. */
|
|
56
64
|
minEndpointingDelay: number;
|
|
57
65
|
/** Maximum endpointing delay in milliseconds. */
|
|
@@ -75,6 +83,7 @@ export interface ParticipantLike {
|
|
|
75
83
|
kind: ParticipantKind;
|
|
76
84
|
}
|
|
77
85
|
export declare class AudioRecognition {
|
|
86
|
+
#private;
|
|
78
87
|
private hooks;
|
|
79
88
|
private stt?;
|
|
80
89
|
private vad?;
|
|
@@ -108,18 +117,48 @@ export declare class AudioRecognition {
|
|
|
108
117
|
private commitUserTurnTask?;
|
|
109
118
|
private vadTask?;
|
|
110
119
|
private sttTask?;
|
|
120
|
+
private interruptionTask?;
|
|
121
|
+
private interruptionDetection?;
|
|
122
|
+
private _inputStartedAt?;
|
|
123
|
+
private ignoreUserTranscriptUntil?;
|
|
124
|
+
private transcriptBuffer;
|
|
125
|
+
private isInterruptionEnabled;
|
|
126
|
+
private isAgentSpeaking;
|
|
127
|
+
private interruptionStreamChannel?;
|
|
111
128
|
constructor(opts: AudioRecognitionOptions);
|
|
112
129
|
/**
|
|
113
130
|
* Current transcript of the user's speech, including interim transcript if available.
|
|
114
131
|
*/
|
|
115
132
|
get currentTranscript(): string;
|
|
133
|
+
/** @internal */
|
|
134
|
+
get inputStartedAt(): number | undefined;
|
|
135
|
+
/** @internal */
|
|
136
|
+
updateOptions(options: {
|
|
137
|
+
turnDetection: TurnDetectionMode | undefined;
|
|
138
|
+
}): void;
|
|
116
139
|
start(): Promise<void>;
|
|
140
|
+
stop(): Promise<void>;
|
|
141
|
+
onStartOfAgentSpeech(): Promise<boolean>;
|
|
142
|
+
onEndOfAgentSpeech(ignoreUserTranscriptUntil: number): Promise<void>;
|
|
143
|
+
/** Start interruption inference when agent is speaking and overlap speech starts. */
|
|
144
|
+
onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span): Promise<void>;
|
|
145
|
+
/** End interruption inference when overlap speech ends. */
|
|
146
|
+
onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span): Promise<boolean | undefined>;
|
|
147
|
+
/**
|
|
148
|
+
* Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp.
|
|
149
|
+
* If the event has no timestamps, we assume it is the same as the next valid event.
|
|
150
|
+
*/
|
|
151
|
+
private flushHeldTranscripts;
|
|
152
|
+
private shouldHoldSttEvent;
|
|
153
|
+
private trySendInterruptionSentinel;
|
|
117
154
|
private ensureUserTurnSpan;
|
|
118
155
|
private userTurnContext;
|
|
119
156
|
private onSTTEvent;
|
|
157
|
+
private onOverlapSpeechEvent;
|
|
120
158
|
private runEOUDetection;
|
|
121
159
|
private createSttTask;
|
|
122
160
|
private createVadTask;
|
|
161
|
+
private createInterruptionTask;
|
|
123
162
|
setInputAudioStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
124
163
|
detachInputAudioStream(): void;
|
|
125
164
|
clearUserTurn(): void;
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
/// <reference types="node" resolution-mode="require"/>
|
|
2
2
|
import type { ParticipantKind } from '@livekit/rtc-node';
|
|
3
3
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
4
|
-
import { type Context } from '@opentelemetry/api';
|
|
4
|
+
import { type Context, type Span } from '@opentelemetry/api';
|
|
5
5
|
import { ReadableStream } from 'node:stream/web';
|
|
6
|
+
import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
7
|
+
import { type OverlappingSpeechEvent } from '../inference/interruption/types.js';
|
|
6
8
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
7
9
|
import { type SpeechEvent } from '../stt/stt.js';
|
|
8
10
|
import { type VAD, type VADEvent } from '../vad.js';
|
|
@@ -27,6 +29,7 @@ export interface PreemptiveGenerationInfo {
|
|
|
27
29
|
transcriptConfidence: number;
|
|
28
30
|
}
|
|
29
31
|
export interface RecognitionHooks {
|
|
32
|
+
onInterruption: (ev: OverlappingSpeechEvent) => void;
|
|
30
33
|
onStartOfSpeech: (ev: VADEvent) => void;
|
|
31
34
|
onVADInferenceDone: (ev: VADEvent) => void;
|
|
32
35
|
onEndOfSpeech: (ev: VADEvent) => void;
|
|
@@ -37,9 +40,13 @@ export interface RecognitionHooks {
|
|
|
37
40
|
retrieveChatCtx: () => ChatContext;
|
|
38
41
|
}
|
|
39
42
|
export interface _TurnDetector {
|
|
43
|
+
/** The model name used by this turn detector. */
|
|
44
|
+
readonly model: string;
|
|
45
|
+
/** The provider name for this turn detector. */
|
|
46
|
+
readonly provider: string;
|
|
40
47
|
unlikelyThreshold: (language?: string) => Promise<number | undefined>;
|
|
41
48
|
supportsLanguage: (language?: string) => Promise<boolean>;
|
|
42
|
-
predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
|
|
49
|
+
predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
|
|
43
50
|
}
|
|
44
51
|
export interface AudioRecognitionOptions {
|
|
45
52
|
/** Hooks for recognition events. */
|
|
@@ -51,7 +58,8 @@ export interface AudioRecognitionOptions {
|
|
|
51
58
|
/** Turn detector for end-of-turn prediction. */
|
|
52
59
|
turnDetector?: _TurnDetector;
|
|
53
60
|
/** Turn detection mode. */
|
|
54
|
-
turnDetectionMode?:
|
|
61
|
+
turnDetectionMode?: TurnDetectionMode;
|
|
62
|
+
interruptionDetection?: AdaptiveInterruptionDetector;
|
|
55
63
|
/** Minimum endpointing delay in milliseconds. */
|
|
56
64
|
minEndpointingDelay: number;
|
|
57
65
|
/** Maximum endpointing delay in milliseconds. */
|
|
@@ -75,6 +83,7 @@ export interface ParticipantLike {
|
|
|
75
83
|
kind: ParticipantKind;
|
|
76
84
|
}
|
|
77
85
|
export declare class AudioRecognition {
|
|
86
|
+
#private;
|
|
78
87
|
private hooks;
|
|
79
88
|
private stt?;
|
|
80
89
|
private vad?;
|
|
@@ -108,18 +117,48 @@ export declare class AudioRecognition {
|
|
|
108
117
|
private commitUserTurnTask?;
|
|
109
118
|
private vadTask?;
|
|
110
119
|
private sttTask?;
|
|
120
|
+
private interruptionTask?;
|
|
121
|
+
private interruptionDetection?;
|
|
122
|
+
private _inputStartedAt?;
|
|
123
|
+
private ignoreUserTranscriptUntil?;
|
|
124
|
+
private transcriptBuffer;
|
|
125
|
+
private isInterruptionEnabled;
|
|
126
|
+
private isAgentSpeaking;
|
|
127
|
+
private interruptionStreamChannel?;
|
|
111
128
|
constructor(opts: AudioRecognitionOptions);
|
|
112
129
|
/**
|
|
113
130
|
* Current transcript of the user's speech, including interim transcript if available.
|
|
114
131
|
*/
|
|
115
132
|
get currentTranscript(): string;
|
|
133
|
+
/** @internal */
|
|
134
|
+
get inputStartedAt(): number | undefined;
|
|
135
|
+
/** @internal */
|
|
136
|
+
updateOptions(options: {
|
|
137
|
+
turnDetection: TurnDetectionMode | undefined;
|
|
138
|
+
}): void;
|
|
116
139
|
start(): Promise<void>;
|
|
140
|
+
stop(): Promise<void>;
|
|
141
|
+
onStartOfAgentSpeech(): Promise<boolean>;
|
|
142
|
+
onEndOfAgentSpeech(ignoreUserTranscriptUntil: number): Promise<void>;
|
|
143
|
+
/** Start interruption inference when agent is speaking and overlap speech starts. */
|
|
144
|
+
onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span): Promise<void>;
|
|
145
|
+
/** End interruption inference when overlap speech ends. */
|
|
146
|
+
onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span): Promise<boolean | undefined>;
|
|
147
|
+
/**
|
|
148
|
+
* Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp.
|
|
149
|
+
* If the event has no timestamps, we assume it is the same as the next valid event.
|
|
150
|
+
*/
|
|
151
|
+
private flushHeldTranscripts;
|
|
152
|
+
private shouldHoldSttEvent;
|
|
153
|
+
private trySendInterruptionSentinel;
|
|
117
154
|
private ensureUserTurnSpan;
|
|
118
155
|
private userTurnContext;
|
|
119
156
|
private onSTTEvent;
|
|
157
|
+
private onOverlapSpeechEvent;
|
|
120
158
|
private runEOUDetection;
|
|
121
159
|
private createSttTask;
|
|
122
160
|
private createVadTask;
|
|
161
|
+
private createInterruptionTask;
|
|
123
162
|
setInputAudioStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
124
163
|
detachInputAudioStream(): void;
|
|
125
164
|
clearUserTurn(): void;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audio_recognition.d.ts","sourceRoot":"","sources":["../../src/voice/audio_recognition.ts"],"names":[],"mappings":";AAGA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EACL,KAAK,OAAO,
|
|
1
|
+
{"version":3,"file":"audio_recognition.d.ts","sourceRoot":"","sources":["../../src/voice/audio_recognition.ts"],"names":[],"mappings":";AAGA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EACL,KAAK,OAAO,EAEZ,KAAK,IAAI,EAGV,MAAM,oBAAoB,CAAC;AAE5B,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEjD,OAAO,KAAK,EAAE,4BAA4B,EAAE,MAAM,oDAAoD,CAAC;AAEvG,OAAO,EAEL,KAAK,sBAAsB,EAC5B,MAAM,oCAAoC,CAAC;AAC5C,OAAO,EAAE,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAM1D,OAAO,EAAE,KAAK,WAAW,EAAmB,MAAM,eAAe,CAAC;AAGlE,OAAO,EAAE,KAAK,GAAG,EAAE,KAAK,QAAQ,EAAgB,MAAM,WAAW,CAAC;AAClE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAC5D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAGvC,MAAM,WAAW,aAAa;IAC5B,sDAAsD;IACtD,aAAa,EAAE,MAAM,CAAC;IACtB,gDAAgD;IAChD,oBAAoB,EAAE,MAAM,CAAC;IAC7B,qEAAqE;IACrE,kBAAkB,EAAE,MAAM,CAAC;IAC3B,4EAA4E;IAC5E,mBAAmB,EAAE,MAAM,CAAC;IAC5B,uEAAuE;IACvE,iBAAiB,EAAE,MAAM,GAAG,SAAS,CAAC;IACtC,uEAAuE;IACvE,iBAAiB,EAAE,MAAM,GAAG,SAAS,CAAC;CACvC;AAED,MAAM,WAAW,wBAAwB;IACvC,aAAa,EAAE,MAAM,CAAC;IACtB,oBAAoB,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,gBAAgB;IAC/B,cAAc,EAAE,CAAC,EAAE,EAAE,sBAAsB,KAAK,IAAI,CAAC;IACrD,eAAe,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IACxC,kBAAkB,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC3C,aAAa,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IACtC,mBAAmB,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,CAAC;IAC/C,iBAAiB,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,CAAC;IAC7C,WAAW,EAAE,CAAC,IAAI,EAAE,aAAa,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IACvD,sBAAsB,EAAE,CAAC,IAAI,EAAE,wBAAwB,KAAK,IAAI,CAAC;IAEjE,eAAe,EAAE,MAAM,WAAW,CAAC;CACpC;AAED,MAAM,WAAW,aAAa;IAC5B,iDAAiD;IACjD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,gDAAgD;IAChD,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,iBAAiB,EAAE,CAAC,QAAQ,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,CAAC;IACtE,gBAAgB,EAAE,CAAC,QAAQ,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAC1D,gBAAgB,CAAC,OAAO,EAAE,WAAW,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CAC3E;AAED,MAAM,WAAW,uBAAuB;IACtC,oCAAoC;IACpC,gBAAgB,EAAE,gBAAgB,CAAC;IACnC,2BAA2B;IAC3B,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,gCAAgC;IAChC,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,gDAAgD;IAChD,YAAY,CAAC,EAAE,aAAa,CAAC;IAC7B,2BAA2B;IAC3B,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;IACtC,qBAAqB,CAAC,EAAE,4BAA4B,CAAC;IACrD,iDAAiD;IACjD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,iDAAiD;IACjD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,qCAAqC;IACrC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,iCAAiC;IACjC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oCAAoC;IACpC,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,yDAAyD;IACzD,oBAAoB,CAAC,EAAE,MAAM,eAAe,GAAG,SAAS,CAAC;CAC1D;AAED;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,GAAG,SAAS,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,eAAe,CAAC;CACvB;AAGD,qBAAa,gBAAgB;;IAC3B,OAAO,CAAC,KAAK,CAAmB;IAChC,OAAO,CAAC,GAAG,CAAC,CAAU;IACtB,OAAO,CAAC,GAAG,CAAC,CAAM;IAClB,OAAO,CAAC,YAAY,CAAC,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAC,CAAoB;IAC9C,OAAO,CAAC,mBAAmB,CAAS;IACpC,OAAO,CAAC,mBAAmB,CAAS;IACpC,OAAO,CAAC,YAAY,CAAC,CAAS;IAC9B,OAAO,CAAC,eAAe,CAAC,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAC,CAAS;IAC7B,OAAO,CAAC,oBAAoB,CAAC,CAAoC;IAEjE,OAAO,CAAC,mBAAmB,CAAqC;IAChE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,uBAAuB,CAAK;IACpC,OAAO,CAAC,eAAe,CAAM;IAC7B,OAAO,CAAC,sBAAsB,CAAM;IACpC,OAAO,CAAC,wBAAwB,CAAM;IACtC,OAAO,CAAC,yBAAyB,CAAgB;IACjD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,eAAe,CAAqB;IAC5C,OAAO,CAAC,iBAAiB,CAAS;IAClC,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,UAAU,CAAC,CAAS;IAE5B,OAAO,CAAC,YAAY,CAAC,CAAO;IAE5B,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,qBAAqB,CAAuC;IACpE,OAAO,CAAC,kBAAkB,CAA0C;IAGpE,OAAO,CAAC,aAAa,CAAC,CAAa;IACnC,OAAO,CAAC,kBAAkB,CAAC,CAAa;IACxC,OAAO,CAAC,OAAO,CAAC,CAAa;IAC7B,OAAO,CAAC,OAAO,CAAC,CAAa;IAC7B,OAAO,CAAC,gBAAgB,CAAC,CAAa;IAGtC,OAAO,CAAC,qBAAqB,CAAC,CAA+B;IAC7D,OAAO,CAAC,eAAe,CAAC,CAAS;IACjC,OAAO,CAAC,yBAAyB,CAAC,CAAS;IAC3C,OAAO,CAAC,gBAAgB,CAAgB;IACxC,OAAO,CAAC,qBAAqB,CAAU;IACvC,OAAO,CAAC,eAAe,CAAU;IACjC,OAAO,CAAC,yBAAyB,CAAC,CAAmD;gBAEzE,IAAI,EAAE,uBAAuB;IAyCzC;;OAEG;IACH,IAAI,iBAAiB,IAAI,MAAM,CAK9B;IAED,gBAAgB;IAChB,IAAI,cAAc,uBAEjB;IAED,gBAAgB;IAChB,aAAa,CAAC,OAAO,EAAE;QAAE,aAAa,EAAE,iBAAiB,GAAG,SAAS,CAAA;KAAE,GAAG,IAAI;IAIxE,KAAK;IAmBL,IAAI;IAMJ,oBAAoB;IAKpB,kBAAkB,CAAC,yBAAyB,EAAE,MAAM;IA4B1D,qFAAqF;IAC/E,sBAAsB,CAAC,cAAc,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,gBAAgB,CAAC,EAAE,IAAI;IAY/F,2DAA2D;IACrD,oBAAoB,CAAC,OAAO,EAAE,MAAM,EAAE,gBAAgB,CAAC,EAAE,IAAI;IAWnE;;;OAGG;YACW,oBAAoB;IA4ElC,OAAO,CAAC,kBAAkB;YAkCZ,2BAA2B;IAoBzC,OAAO,CAAC,kBAAkB;IA0B1B,OAAO,CAAC,eAAe;YAKT,UAAU;IAgOxB,OAAO,CAAC,oBAAoB;IAM5B,OAAO,CAAC,eAAe;YAmKT,aAAa;YAqDb,aAAa;YA4Eb,sBAAsB;IA+EpC,mBAAmB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC;IAI3D,sBAAsB;IAItB,aAAa;IAeb,cAAc,CAAC,aAAa,EAAE,OAAO;IA8C/B,KAAK;IAWX,OAAO,CAAC,gBAAgB;IAuBxB,OAAO,KAAK,oBAAoB,GAU/B;CACF"}
|
|
@@ -5,14 +5,19 @@ import {
|
|
|
5
5
|
trace
|
|
6
6
|
} from "@opentelemetry/api";
|
|
7
7
|
import { ReadableStream } from "node:stream/web";
|
|
8
|
+
import { InterruptionDetectionError } from "../inference/interruption/errors.js";
|
|
9
|
+
import { InterruptionStreamSentinel } from "../inference/interruption/interruption_stream.js";
|
|
10
|
+
import {
|
|
11
|
+
} from "../inference/interruption/types.js";
|
|
8
12
|
import {} from "../llm/chat_context.js";
|
|
9
13
|
import { log } from "../log.js";
|
|
10
14
|
import { DeferredReadableStream, isStreamReaderReleaseError } from "../stream/deferred_stream.js";
|
|
11
15
|
import { IdentityTransform } from "../stream/identity_transform.js";
|
|
12
16
|
import { mergeReadableStreams } from "../stream/merge_readable_streams.js";
|
|
17
|
+
import { createStreamChannel } from "../stream/stream_channel.js";
|
|
13
18
|
import { SpeechEventType } from "../stt/stt.js";
|
|
14
19
|
import { traceTypes, tracer } from "../telemetry/index.js";
|
|
15
|
-
import { Task, delay } from "../utils.js";
|
|
20
|
+
import { Task, delay, waitForAbort } from "../utils.js";
|
|
16
21
|
import { VADEventType } from "../vad.js";
|
|
17
22
|
import { setParticipantSpanAttributes } from "./utils.js";
|
|
18
23
|
class AudioRecognition {
|
|
@@ -50,6 +55,15 @@ class AudioRecognition {
|
|
|
50
55
|
commitUserTurnTask;
|
|
51
56
|
vadTask;
|
|
52
57
|
sttTask;
|
|
58
|
+
interruptionTask;
|
|
59
|
+
// interruption detection
|
|
60
|
+
interruptionDetection;
|
|
61
|
+
_inputStartedAt;
|
|
62
|
+
ignoreUserTranscriptUntil;
|
|
63
|
+
transcriptBuffer;
|
|
64
|
+
isInterruptionEnabled;
|
|
65
|
+
isAgentSpeaking;
|
|
66
|
+
interruptionStreamChannel;
|
|
53
67
|
constructor(opts) {
|
|
54
68
|
this.hooks = opts.recognitionHooks;
|
|
55
69
|
this.stt = opts.stt;
|
|
@@ -64,9 +78,28 @@ class AudioRecognition {
|
|
|
64
78
|
this.sttProvider = opts.sttProvider;
|
|
65
79
|
this.getLinkedParticipant = opts.getLinkedParticipant;
|
|
66
80
|
this.deferredInputStream = new DeferredReadableStream();
|
|
67
|
-
|
|
68
|
-
this.
|
|
69
|
-
this.
|
|
81
|
+
this.interruptionDetection = opts.interruptionDetection;
|
|
82
|
+
this.transcriptBuffer = [];
|
|
83
|
+
this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad);
|
|
84
|
+
this.isAgentSpeaking = false;
|
|
85
|
+
if (opts.interruptionDetection) {
|
|
86
|
+
const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee();
|
|
87
|
+
const [inputStream, sttInputStream] = teedInput.tee();
|
|
88
|
+
this.vadInputStream = vadInputStream;
|
|
89
|
+
this.sttInputStream = mergeReadableStreams(
|
|
90
|
+
sttInputStream,
|
|
91
|
+
this.silenceAudioTransform.readable
|
|
92
|
+
);
|
|
93
|
+
this.interruptionStreamChannel = createStreamChannel();
|
|
94
|
+
this.interruptionStreamChannel.addStreamInput(inputStream);
|
|
95
|
+
} else {
|
|
96
|
+
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
97
|
+
this.vadInputStream = vadInputStream;
|
|
98
|
+
this.sttInputStream = mergeReadableStreams(
|
|
99
|
+
sttInputStream,
|
|
100
|
+
this.silenceAudioTransform.readable
|
|
101
|
+
);
|
|
102
|
+
}
|
|
70
103
|
this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
|
|
71
104
|
}
|
|
72
105
|
/**
|
|
@@ -78,6 +111,14 @@ class AudioRecognition {
|
|
|
78
111
|
}
|
|
79
112
|
return this.audioTranscript;
|
|
80
113
|
}
|
|
114
|
+
/** @internal */
|
|
115
|
+
get inputStartedAt() {
|
|
116
|
+
return this._inputStartedAt;
|
|
117
|
+
}
|
|
118
|
+
/** @internal */
|
|
119
|
+
updateOptions(options) {
|
|
120
|
+
this.turnDetectionMode = options.turnDetection;
|
|
121
|
+
}
|
|
81
122
|
async start() {
|
|
82
123
|
this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));
|
|
83
124
|
this.vadTask.result.catch((err) => {
|
|
@@ -87,6 +128,156 @@ class AudioRecognition {
|
|
|
87
128
|
this.sttTask.result.catch((err) => {
|
|
88
129
|
this.logger.error(`Error running STT task: ${err}`);
|
|
89
130
|
});
|
|
131
|
+
this.interruptionTask = Task.from(
|
|
132
|
+
({ signal }) => this.createInterruptionTask(this.interruptionDetection, signal)
|
|
133
|
+
);
|
|
134
|
+
this.interruptionTask.result.catch((err) => {
|
|
135
|
+
this.logger.error(`Error running interruption task: ${err}`);
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
async stop() {
|
|
139
|
+
var _a, _b, _c;
|
|
140
|
+
await ((_a = this.sttTask) == null ? void 0 : _a.cancelAndWait());
|
|
141
|
+
await ((_b = this.vadTask) == null ? void 0 : _b.cancelAndWait());
|
|
142
|
+
await ((_c = this.interruptionTask) == null ? void 0 : _c.cancelAndWait());
|
|
143
|
+
}
|
|
144
|
+
async onStartOfAgentSpeech() {
|
|
145
|
+
this.isAgentSpeaking = true;
|
|
146
|
+
return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted());
|
|
147
|
+
}
|
|
148
|
+
async onEndOfAgentSpeech(ignoreUserTranscriptUntil) {
|
|
149
|
+
if (!this.isInterruptionEnabled) {
|
|
150
|
+
this.isAgentSpeaking = false;
|
|
151
|
+
return;
|
|
152
|
+
}
|
|
153
|
+
const inputOpen = await this.trySendInterruptionSentinel(
|
|
154
|
+
InterruptionStreamSentinel.agentSpeechEnded()
|
|
155
|
+
);
|
|
156
|
+
if (!inputOpen) {
|
|
157
|
+
this.isAgentSpeaking = false;
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
160
|
+
if (this.isAgentSpeaking) {
|
|
161
|
+
if (this.ignoreUserTranscriptUntil === void 0) {
|
|
162
|
+
this.onEndOfOverlapSpeech(Date.now());
|
|
163
|
+
}
|
|
164
|
+
this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil) : ignoreUserTranscriptUntil;
|
|
165
|
+
await this.flushHeldTranscripts();
|
|
166
|
+
}
|
|
167
|
+
this.isAgentSpeaking = false;
|
|
168
|
+
}
|
|
169
|
+
/** Start interruption inference when agent is speaking and overlap speech starts. */
|
|
170
|
+
async onStartOfOverlapSpeech(speechDuration, startedAt, userSpeakingSpan) {
|
|
171
|
+
if (this.isAgentSpeaking) {
|
|
172
|
+
this.trySendInterruptionSentinel(
|
|
173
|
+
InterruptionStreamSentinel.overlapSpeechStarted(
|
|
174
|
+
speechDuration,
|
|
175
|
+
startedAt,
|
|
176
|
+
userSpeakingSpan
|
|
177
|
+
)
|
|
178
|
+
);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
/** End interruption inference when overlap speech ends. */
|
|
182
|
+
async onEndOfOverlapSpeech(endedAt, userSpeakingSpan) {
|
|
183
|
+
if (!this.isInterruptionEnabled) {
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
if (userSpeakingSpan && userSpeakingSpan.isRecording()) {
|
|
187
|
+
userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, "false");
|
|
188
|
+
}
|
|
189
|
+
return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded(endedAt));
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp.
|
|
193
|
+
* If the event has no timestamps, we assume it is the same as the next valid event.
|
|
194
|
+
*/
|
|
195
|
+
async flushHeldTranscripts() {
|
|
196
|
+
if (!this.isInterruptionEnabled || this.ignoreUserTranscriptUntil === void 0 || this.transcriptBuffer.length === 0) {
|
|
197
|
+
return;
|
|
198
|
+
}
|
|
199
|
+
if (!this._inputStartedAt) {
|
|
200
|
+
this.transcriptBuffer = [];
|
|
201
|
+
this.ignoreUserTranscriptUntil = void 0;
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
let emitFromIndex = null;
|
|
205
|
+
let shouldFlush = false;
|
|
206
|
+
for (let i = 0; i < this.transcriptBuffer.length; i++) {
|
|
207
|
+
const ev = this.transcriptBuffer[i];
|
|
208
|
+
if (!ev || !ev.alternatives || ev.alternatives.length === 0) {
|
|
209
|
+
emitFromIndex = Math.min(emitFromIndex ?? i, i);
|
|
210
|
+
continue;
|
|
211
|
+
}
|
|
212
|
+
const firstAlternative = ev.alternatives[0];
|
|
213
|
+
if (firstAlternative.startTime === firstAlternative.endTime && firstAlternative.startTime === 0) {
|
|
214
|
+
this.transcriptBuffer = [];
|
|
215
|
+
this.ignoreUserTranscriptUntil = void 0;
|
|
216
|
+
return;
|
|
217
|
+
}
|
|
218
|
+
if (this.#alternativeEndsBeforeIgnoreWindow(firstAlternative)) {
|
|
219
|
+
emitFromIndex = null;
|
|
220
|
+
} else {
|
|
221
|
+
emitFromIndex = Math.min(emitFromIndex ?? i, i);
|
|
222
|
+
shouldFlush = true;
|
|
223
|
+
break;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
const eventsToEmit = emitFromIndex !== null && shouldFlush ? this.transcriptBuffer.slice(emitFromIndex) : [];
|
|
227
|
+
this.transcriptBuffer = [];
|
|
228
|
+
this.ignoreUserTranscriptUntil = void 0;
|
|
229
|
+
for (const event of eventsToEmit) {
|
|
230
|
+
this.logger.trace(
|
|
231
|
+
{
|
|
232
|
+
event: event.type
|
|
233
|
+
},
|
|
234
|
+
"re-emitting held user transcript"
|
|
235
|
+
);
|
|
236
|
+
this.onSTTEvent(event);
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
#alternativeEndsBeforeIgnoreWindow(alternative) {
|
|
240
|
+
if (this.ignoreUserTranscriptUntil === void 0 || !this._inputStartedAt || alternative.startTime <= 0) {
|
|
241
|
+
return false;
|
|
242
|
+
}
|
|
243
|
+
return alternative.startTime * 1e3 + this._inputStartedAt < this.ignoreUserTranscriptUntil;
|
|
244
|
+
}
|
|
245
|
+
shouldHoldSttEvent(ev) {
|
|
246
|
+
if (!this.isInterruptionEnabled) {
|
|
247
|
+
return false;
|
|
248
|
+
}
|
|
249
|
+
if (this.isAgentSpeaking) {
|
|
250
|
+
return true;
|
|
251
|
+
}
|
|
252
|
+
if (ev.type === SpeechEventType.START_OF_SPEECH) {
|
|
253
|
+
this.ignoreUserTranscriptUntil = void 0;
|
|
254
|
+
this.transcriptBuffer = [];
|
|
255
|
+
return false;
|
|
256
|
+
}
|
|
257
|
+
if (this.ignoreUserTranscriptUntil === void 0) {
|
|
258
|
+
return false;
|
|
259
|
+
}
|
|
260
|
+
if (!ev.alternatives || ev.alternatives.length === 0) {
|
|
261
|
+
return true;
|
|
262
|
+
}
|
|
263
|
+
const alternative = ev.alternatives[0];
|
|
264
|
+
if (alternative.startTime !== alternative.endTime && this.#alternativeEndsBeforeIgnoreWindow(alternative)) {
|
|
265
|
+
return true;
|
|
266
|
+
}
|
|
267
|
+
return false;
|
|
268
|
+
}
|
|
269
|
+
async trySendInterruptionSentinel(frame) {
|
|
270
|
+
if (this.isInterruptionEnabled && this.interruptionStreamChannel && !this.interruptionStreamChannel.closed) {
|
|
271
|
+
try {
|
|
272
|
+
await this.interruptionStreamChannel.write(frame);
|
|
273
|
+
return true;
|
|
274
|
+
} catch (e) {
|
|
275
|
+
this.logger.warn(
|
|
276
|
+
`could not forward interruption sentinel: ${e instanceof Error ? e.message : String(e)}`
|
|
277
|
+
);
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
return false;
|
|
90
281
|
}
|
|
91
282
|
ensureUserTurnSpan(startTime) {
|
|
92
283
|
var _a;
|
|
@@ -128,6 +319,18 @@ class AudioRecognition {
|
|
|
128
319
|
);
|
|
129
320
|
return;
|
|
130
321
|
}
|
|
322
|
+
if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) {
|
|
323
|
+
if (this.shouldHoldSttEvent(ev)) {
|
|
324
|
+
this.logger.trace(
|
|
325
|
+
{ event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil },
|
|
326
|
+
"holding STT event until ignore_user_transcript_until expires"
|
|
327
|
+
);
|
|
328
|
+
this.transcriptBuffer.push(ev);
|
|
329
|
+
return;
|
|
330
|
+
} else {
|
|
331
|
+
await this.flushHeldTranscripts();
|
|
332
|
+
}
|
|
333
|
+
}
|
|
131
334
|
switch (ev.type) {
|
|
132
335
|
case SpeechEventType.FINAL_TRANSCRIPT:
|
|
133
336
|
const transcript = (_c = (_b = ev.alternatives) == null ? void 0 : _b[0]) == null ? void 0 : _c.text;
|
|
@@ -272,6 +475,11 @@ class AudioRecognition {
|
|
|
272
475
|
}
|
|
273
476
|
}
|
|
274
477
|
}
|
|
478
|
+
onOverlapSpeechEvent(ev) {
|
|
479
|
+
if (ev.isInterruption) {
|
|
480
|
+
this.hooks.onInterruption(ev);
|
|
481
|
+
}
|
|
482
|
+
}
|
|
275
483
|
runEOUDetection(chatCtx) {
|
|
276
484
|
var _a;
|
|
277
485
|
this.logger.debug(
|
|
@@ -467,7 +675,7 @@ class AudioRecognition {
|
|
|
467
675
|
if (ev.rawAccumulatedSpeech > 0) {
|
|
468
676
|
this.lastSpeakingTime = Date.now();
|
|
469
677
|
if (this.speechStartTime === void 0) {
|
|
470
|
-
this.speechStartTime = Date.now();
|
|
678
|
+
this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech;
|
|
471
679
|
}
|
|
472
680
|
}
|
|
473
681
|
break;
|
|
@@ -492,6 +700,72 @@ class AudioRecognition {
|
|
|
492
700
|
this.logger.debug("VAD task closed");
|
|
493
701
|
}
|
|
494
702
|
}
|
|
703
|
+
async createInterruptionTask(interruptionDetection, signal) {
|
|
704
|
+
if (!interruptionDetection || !this.interruptionStreamChannel) return;
|
|
705
|
+
const stream = interruptionDetection.createStream();
|
|
706
|
+
const inputReader = this.interruptionStreamChannel.stream().getReader();
|
|
707
|
+
const cleanup = async () => {
|
|
708
|
+
try {
|
|
709
|
+
signal.removeEventListener("abort", abortHandler);
|
|
710
|
+
eventReader.releaseLock();
|
|
711
|
+
await stream.close();
|
|
712
|
+
} catch (e) {
|
|
713
|
+
this.logger.debug("createInterruptionTask: error during abort handler:", e);
|
|
714
|
+
}
|
|
715
|
+
};
|
|
716
|
+
const forwardTask = (async () => {
|
|
717
|
+
try {
|
|
718
|
+
const abortPromise = waitForAbort(signal);
|
|
719
|
+
while (!signal.aborted) {
|
|
720
|
+
const res = await Promise.race([inputReader.read(), abortPromise]);
|
|
721
|
+
if (!res) break;
|
|
722
|
+
const { value, done } = res;
|
|
723
|
+
if (done) break;
|
|
724
|
+
if (value instanceof AudioFrame) {
|
|
725
|
+
const frameDurationMs = value.samplesPerChannel / value.sampleRate * 1e3;
|
|
726
|
+
this._inputStartedAt ??= Date.now() - frameDurationMs;
|
|
727
|
+
} else {
|
|
728
|
+
this._inputStartedAt ??= Date.now();
|
|
729
|
+
}
|
|
730
|
+
await stream.pushFrame(value);
|
|
731
|
+
}
|
|
732
|
+
} finally {
|
|
733
|
+
inputReader.releaseLock();
|
|
734
|
+
}
|
|
735
|
+
})();
|
|
736
|
+
const eventReader = stream.stream().getReader();
|
|
737
|
+
const abortHandler = async () => {
|
|
738
|
+
await cleanup();
|
|
739
|
+
};
|
|
740
|
+
signal.addEventListener("abort", abortHandler);
|
|
741
|
+
try {
|
|
742
|
+
const abortPromise = waitForAbort(signal);
|
|
743
|
+
while (!signal.aborted) {
|
|
744
|
+
const res = await Promise.race([eventReader.read(), abortPromise]);
|
|
745
|
+
if (!res) break;
|
|
746
|
+
const { done, value: ev } = res;
|
|
747
|
+
if (done) break;
|
|
748
|
+
this.onOverlapSpeechEvent(ev);
|
|
749
|
+
}
|
|
750
|
+
} catch (e) {
|
|
751
|
+
if (!signal.aborted) {
|
|
752
|
+
const cause = e instanceof Error ? e : new Error(String(e));
|
|
753
|
+
interruptionDetection.emitError(
|
|
754
|
+
new InterruptionDetectionError(
|
|
755
|
+
cause.message,
|
|
756
|
+
Date.now(),
|
|
757
|
+
interruptionDetection.label,
|
|
758
|
+
false
|
|
759
|
+
)
|
|
760
|
+
);
|
|
761
|
+
this.logger.error(e, "Error in interruption task");
|
|
762
|
+
}
|
|
763
|
+
} finally {
|
|
764
|
+
await cleanup();
|
|
765
|
+
await forwardTask;
|
|
766
|
+
this.logger.debug("Interruption task closed");
|
|
767
|
+
}
|
|
768
|
+
}
|
|
495
769
|
setInputAudioStream(audioStream) {
|
|
496
770
|
this.deferredInputStream.setSource(audioStream);
|
|
497
771
|
}
|
|
@@ -546,13 +820,15 @@ class AudioRecognition {
|
|
|
546
820
|
});
|
|
547
821
|
}
|
|
548
822
|
async close() {
|
|
549
|
-
var _a, _b, _c, _d;
|
|
823
|
+
var _a, _b, _c, _d, _e, _f;
|
|
550
824
|
this.detachInputAudioStream();
|
|
551
825
|
this.silenceAudioWriter.releaseLock();
|
|
552
826
|
await ((_a = this.commitUserTurnTask) == null ? void 0 : _a.cancelAndWait());
|
|
553
827
|
await ((_b = this.sttTask) == null ? void 0 : _b.cancelAndWait());
|
|
554
828
|
await ((_c = this.vadTask) == null ? void 0 : _c.cancelAndWait());
|
|
555
829
|
await ((_d = this.bounceEOUTask) == null ? void 0 : _d.cancelAndWait());
|
|
830
|
+
await ((_e = this.interruptionTask) == null ? void 0 : _e.cancelAndWait());
|
|
831
|
+
await ((_f = this.interruptionStreamChannel) == null ? void 0 : _f.close());
|
|
556
832
|
}
|
|
557
833
|
_endUserTurnSpan({
|
|
558
834
|
transcript,
|
|
@@ -572,7 +848,13 @@ class AudioRecognition {
|
|
|
572
848
|
}
|
|
573
849
|
}
|
|
574
850
|
get vadBaseTurnDetection() {
|
|
575
|
-
|
|
851
|
+
if (typeof this.turnDetectionMode === "object") {
|
|
852
|
+
return false;
|
|
853
|
+
}
|
|
854
|
+
if (this.turnDetectionMode === void 0 || this.turnDetectionMode === "vad") {
|
|
855
|
+
return true;
|
|
856
|
+
}
|
|
857
|
+
return false;
|
|
576
858
|
}
|
|
577
859
|
}
|
|
578
860
|
export {
|