@livekit/agents 1.0.47 → 1.1.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/constants.cjs +27 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +9 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +18 -0
- package/dist/constants.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +12 -12
- package/dist/inference/api_protos.d.ts +12 -12
- package/dist/inference/interruption/defaults.cjs +81 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +19 -0
- package/dist/inference/interruption/defaults.d.ts +19 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +46 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +44 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +12 -0
- package/dist/inference/interruption/errors.d.ts +12 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +20 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +147 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +63 -0
- package/dist/inference/interruption/http_transport.d.ts +63 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +121 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
- package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
- package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_cache_entry.js +34 -0
- package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
- package/dist/inference/interruption/interruption_detector.cjs +181 -0
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
- package/dist/inference/interruption/interruption_detector.d.cts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts +59 -0
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_detector.js +147 -0
- package/dist/inference/interruption/interruption_detector.js.map +1 -0
- package/dist/inference/interruption/interruption_stream.cjs +368 -0
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
- package/dist/inference/interruption/interruption_stream.d.cts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts +46 -0
- package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
- package/dist/inference/interruption/interruption_stream.js +344 -0
- package/dist/inference/interruption/interruption_stream.js.map +1 -0
- package/dist/inference/interruption/types.cjs +17 -0
- package/dist/inference/interruption/types.cjs.map +1 -0
- package/dist/inference/interruption/types.d.cts +66 -0
- package/dist/inference/interruption/types.d.ts +66 -0
- package/dist/inference/interruption/types.d.ts.map +1 -0
- package/dist/inference/interruption/types.js +1 -0
- package/dist/inference/interruption/types.js.map +1 -0
- package/dist/inference/interruption/utils.cjs +130 -0
- package/dist/inference/interruption/utils.cjs.map +1 -0
- package/dist/inference/interruption/utils.d.cts +41 -0
- package/dist/inference/interruption/utils.d.ts +41 -0
- package/dist/inference/interruption/utils.d.ts.map +1 -0
- package/dist/inference/interruption/utils.js +105 -0
- package/dist/inference/interruption/utils.js.map +1 -0
- package/dist/inference/interruption/utils.test.cjs +105 -0
- package/dist/inference/interruption/utils.test.cjs.map +1 -0
- package/dist/inference/interruption/utils.test.js +104 -0
- package/dist/inference/interruption/utils.test.js.map +1 -0
- package/dist/inference/interruption/ws_transport.cjs +329 -0
- package/dist/inference/interruption/ws_transport.cjs.map +1 -0
- package/dist/inference/interruption/ws_transport.d.cts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts +33 -0
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
- package/dist/inference/interruption/ws_transport.js +295 -0
- package/dist/inference/interruption/ws_transport.js.map +1 -0
- package/dist/inference/llm.cjs +14 -10
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +2 -1
- package/dist/inference/llm.d.ts +2 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +8 -10
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +7 -2
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +2 -0
- package/dist/inference/stt.d.ts +2 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +8 -3
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +7 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -0
- package/dist/inference/tts.d.ts +2 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +8 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/utils.cjs +26 -7
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +13 -0
- package/dist/inference/utils.d.ts +13 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +18 -2
- package/dist/inference/utils.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +28 -1
- package/dist/llm/chat_context.d.ts +28 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +2 -2
- package/dist/llm/index.d.ts +2 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +16 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +9 -0
- package/dist/llm/llm.d.ts +9 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +16 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/realtime.cjs +3 -0
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js +3 -0
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +45 -1
- package/dist/metrics/base.d.ts +45 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/index.cjs +5 -0
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -1
- package/dist/metrics/index.d.ts +2 -1
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +6 -0
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/model_usage.cjs +189 -0
- package/dist/metrics/model_usage.cjs.map +1 -0
- package/dist/metrics/model_usage.d.cts +92 -0
- package/dist/metrics/model_usage.d.ts +92 -0
- package/dist/metrics/model_usage.d.ts.map +1 -0
- package/dist/metrics/model_usage.js +164 -0
- package/dist/metrics/model_usage.js.map +1 -0
- package/dist/metrics/model_usage.test.cjs +474 -0
- package/dist/metrics/model_usage.test.cjs.map +1 -0
- package/dist/metrics/model_usage.test.js +476 -0
- package/dist/metrics/model_usage.test.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +3 -0
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +9 -0
- package/dist/metrics/usage_collector.d.ts +9 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +3 -0
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +9 -0
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +9 -0
- package/dist/metrics/utils.js.map +1 -1
- package/dist/stream/multi_input_stream.test.cjs +4 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -1
- package/dist/stream/multi_input_stream.test.js +5 -1
- package/dist/stream/multi_input_stream.test.js.map +1 -1
- package/dist/stream/stream_channel.cjs +31 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +4 -2
- package/dist/stream/stream_channel.d.ts +4 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +31 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/stt/stt.cjs +34 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +22 -0
- package/dist/stt/stt.d.ts +22 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +34 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/otel_http_exporter.cjs +24 -5
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
- package/dist/telemetry/otel_http_exporter.d.cts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts +1 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
- package/dist/telemetry/otel_http_exporter.js +24 -5
- package/dist/telemetry/otel_http_exporter.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +5 -5
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +9 -5
- package/dist/telemetry/trace_types.d.ts +9 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +5 -5
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/telemetry/traces.cjs +47 -8
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +47 -8
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +64 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +34 -0
- package/dist/tts/tts.d.ts +34 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +64 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +1 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +1 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +34 -4
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +11 -2
- package/dist/voice/agent.d.ts +11 -2
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +34 -4
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +292 -44
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +27 -6
- package/dist/voice/agent_activity.d.ts +27 -6
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +293 -45
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -48
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +90 -20
- package/dist/voice/agent_session.d.ts +90 -20
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +105 -46
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +287 -6
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +42 -3
- package/dist/voice/audio_recognition.d.ts +42 -3
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +289 -7
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/client_events.cjs +554 -0
- package/dist/voice/client_events.cjs.map +1 -0
- package/dist/voice/client_events.d.cts +195 -0
- package/dist/voice/client_events.d.ts +195 -0
- package/dist/voice/client_events.d.ts.map +1 -0
- package/dist/voice/client_events.js +548 -0
- package/dist/voice/client_events.js.map +1 -0
- package/dist/voice/events.cjs +1 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +8 -5
- package/dist/voice/events.d.ts +8 -5
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +43 -8
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -3
- package/dist/voice/generation.d.ts +3 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/report.cjs +20 -8
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -0
- package/dist/voice/report.d.ts +5 -0
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +20 -8
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +106 -0
- package/dist/voice/report.test.cjs.map +1 -0
- package/dist/voice/report.test.js +105 -0
- package/dist/voice/report.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +16 -41
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +4 -9
- package/dist/voice/room_io/room_io.d.ts +4 -9
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +17 -43
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/turn_config/endpointing.cjs +33 -0
- package/dist/voice/turn_config/endpointing.cjs.map +1 -0
- package/dist/voice/turn_config/endpointing.d.cts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts +30 -0
- package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
- package/dist/voice/turn_config/endpointing.js +9 -0
- package/dist/voice/turn_config/endpointing.js.map +1 -0
- package/dist/voice/turn_config/interruption.cjs +37 -0
- package/dist/voice/turn_config/interruption.cjs.map +1 -0
- package/dist/voice/turn_config/interruption.d.cts +53 -0
- package/dist/voice/turn_config/interruption.d.ts +53 -0
- package/dist/voice/turn_config/interruption.d.ts.map +1 -0
- package/dist/voice/turn_config/interruption.js +13 -0
- package/dist/voice/turn_config/interruption.js.map +1 -0
- package/dist/voice/turn_config/turn_handling.cjs +35 -0
- package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
- package/dist/voice/turn_config/turn_handling.d.cts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts +36 -0
- package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
- package/dist/voice/turn_config/turn_handling.js +11 -0
- package/dist/voice/turn_config/turn_handling.js.map +1 -0
- package/dist/voice/turn_config/utils.cjs +97 -0
- package/dist/voice/turn_config/utils.cjs.map +1 -0
- package/dist/voice/turn_config/utils.d.cts +25 -0
- package/dist/voice/turn_config/utils.d.ts +25 -0
- package/dist/voice/turn_config/utils.d.ts.map +1 -0
- package/dist/voice/turn_config/utils.js +73 -0
- package/dist/voice/turn_config/utils.js.map +1 -0
- package/dist/voice/turn_config/utils.test.cjs +86 -0
- package/dist/voice/turn_config/utils.test.cjs.map +1 -0
- package/dist/voice/turn_config/utils.test.js +85 -0
- package/dist/voice/turn_config/utils.test.js.map +1 -0
- package/dist/voice/wire_format.cjs +798 -0
- package/dist/voice/wire_format.cjs.map +1 -0
- package/dist/voice/wire_format.d.cts +5503 -0
- package/dist/voice/wire_format.d.ts +5503 -0
- package/dist/voice/wire_format.d.ts.map +1 -0
- package/dist/voice/wire_format.js +728 -0
- package/dist/voice/wire_format.js.map +1 -0
- package/package.json +2 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/constants.ts +13 -0
- package/src/index.ts +2 -1
- package/src/inference/interruption/defaults.ts +51 -0
- package/src/inference/interruption/errors.ts +25 -0
- package/src/inference/interruption/http_transport.ts +187 -0
- package/src/inference/interruption/interruption_cache_entry.ts +50 -0
- package/src/inference/interruption/interruption_detector.ts +188 -0
- package/src/inference/interruption/interruption_stream.ts +467 -0
- package/src/inference/interruption/types.ts +84 -0
- package/src/inference/interruption/utils.test.ts +132 -0
- package/src/inference/interruption/utils.ts +137 -0
- package/src/inference/interruption/ws_transport.ts +402 -0
- package/src/inference/llm.ts +9 -12
- package/src/inference/stt.ts +10 -3
- package/src/inference/tts.ts +10 -3
- package/src/inference/utils.ts +29 -1
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +161 -0
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +16 -0
- package/src/llm/realtime.ts +4 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/metrics/base.ts +48 -1
- package/src/metrics/index.ts +11 -0
- package/src/metrics/model_usage.test.ts +545 -0
- package/src/metrics/model_usage.ts +262 -0
- package/src/metrics/usage_collector.ts +11 -0
- package/src/metrics/utils.ts +11 -0
- package/src/stream/multi_input_stream.test.ts +6 -1
- package/src/stream/stream_channel.ts +34 -2
- package/src/stt/stt.ts +38 -0
- package/src/telemetry/otel_http_exporter.ts +28 -5
- package/src/telemetry/trace_types.ts +11 -8
- package/src/telemetry/traces.ts +111 -54
- package/src/tts/tts.ts +69 -1
- package/src/utils.ts +5 -0
- package/src/voice/agent.ts +41 -3
- package/src/voice/agent_activity.ts +371 -34
- package/src/voice/agent_session.ts +207 -59
- package/src/voice/audio_recognition.ts +385 -9
- package/src/voice/client_events.ts +838 -0
- package/src/voice/events.ts +14 -4
- package/src/voice/generation.ts +52 -9
- package/src/voice/index.ts +1 -0
- package/src/voice/report.test.ts +117 -0
- package/src/voice/report.ts +29 -6
- package/src/voice/room_io/room_io.ts +21 -64
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/turn_config/endpointing.ts +33 -0
- package/src/voice/turn_config/interruption.ts +56 -0
- package/src/voice/turn_config/turn_handling.ts +45 -0
- package/src/voice/turn_config/utils.test.ts +100 -0
- package/src/voice/turn_config/utils.ts +103 -0
- package/src/voice/wire_format.ts +827 -0
|
@@ -12,14 +12,22 @@ import {
|
|
|
12
12
|
} from '@opentelemetry/api';
|
|
13
13
|
import type { WritableStreamDefaultWriter } from 'node:stream/web';
|
|
14
14
|
import { ReadableStream } from 'node:stream/web';
|
|
15
|
+
import { InterruptionDetectionError } from '../inference/interruption/errors.js';
|
|
16
|
+
import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
|
|
17
|
+
import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js';
|
|
18
|
+
import {
|
|
19
|
+
type InterruptionSentinel,
|
|
20
|
+
type OverlappingSpeechEvent,
|
|
21
|
+
} from '../inference/interruption/types.js';
|
|
15
22
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
16
23
|
import { log } from '../log.js';
|
|
17
24
|
import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
|
|
18
25
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
19
26
|
import { mergeReadableStreams } from '../stream/merge_readable_streams.js';
|
|
27
|
+
import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js';
|
|
20
28
|
import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
|
|
21
29
|
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
22
|
-
import { Task, delay } from '../utils.js';
|
|
30
|
+
import { Task, delay, waitForAbort } from '../utils.js';
|
|
23
31
|
import { type VAD, type VADEvent, VADEventType } from '../vad.js';
|
|
24
32
|
import type { TurnDetectionMode } from './agent_session.js';
|
|
25
33
|
import type { STTNode } from './io.js';
|
|
@@ -46,6 +54,7 @@ export interface PreemptiveGenerationInfo {
|
|
|
46
54
|
}
|
|
47
55
|
|
|
48
56
|
export interface RecognitionHooks {
|
|
57
|
+
onInterruption: (ev: OverlappingSpeechEvent) => void;
|
|
49
58
|
onStartOfSpeech: (ev: VADEvent) => void;
|
|
50
59
|
onVADInferenceDone: (ev: VADEvent) => void;
|
|
51
60
|
onEndOfSpeech: (ev: VADEvent) => void;
|
|
@@ -58,9 +67,13 @@ export interface RecognitionHooks {
|
|
|
58
67
|
}
|
|
59
68
|
|
|
60
69
|
export interface _TurnDetector {
|
|
70
|
+
/** The model name used by this turn detector. */
|
|
71
|
+
readonly model: string;
|
|
72
|
+
/** The provider name for this turn detector. */
|
|
73
|
+
readonly provider: string;
|
|
61
74
|
unlikelyThreshold: (language?: string) => Promise<number | undefined>;
|
|
62
75
|
supportsLanguage: (language?: string) => Promise<boolean>;
|
|
63
|
-
predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
|
|
76
|
+
predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
|
|
64
77
|
}
|
|
65
78
|
|
|
66
79
|
export interface AudioRecognitionOptions {
|
|
@@ -73,7 +86,8 @@ export interface AudioRecognitionOptions {
|
|
|
73
86
|
/** Turn detector for end-of-turn prediction. */
|
|
74
87
|
turnDetector?: _TurnDetector;
|
|
75
88
|
/** Turn detection mode. */
|
|
76
|
-
turnDetectionMode?:
|
|
89
|
+
turnDetectionMode?: TurnDetectionMode;
|
|
90
|
+
interruptionDetection?: AdaptiveInterruptionDetector;
|
|
77
91
|
/** Minimum endpointing delay in milliseconds. */
|
|
78
92
|
minEndpointingDelay: number;
|
|
79
93
|
/** Maximum endpointing delay in milliseconds. */
|
|
@@ -98,12 +112,13 @@ export interface ParticipantLike {
|
|
|
98
112
|
kind: ParticipantKind;
|
|
99
113
|
}
|
|
100
114
|
|
|
115
|
+
// TODO add ability to update stt/vad/interruption-detection
|
|
101
116
|
export class AudioRecognition {
|
|
102
117
|
private hooks: RecognitionHooks;
|
|
103
118
|
private stt?: STTNode;
|
|
104
119
|
private vad?: VAD;
|
|
105
120
|
private turnDetector?: _TurnDetector;
|
|
106
|
-
private turnDetectionMode?:
|
|
121
|
+
private turnDetectionMode?: TurnDetectionMode;
|
|
107
122
|
private minEndpointingDelay: number;
|
|
108
123
|
private maxEndpointingDelay: number;
|
|
109
124
|
private lastLanguage?: string;
|
|
@@ -137,6 +152,16 @@ export class AudioRecognition {
|
|
|
137
152
|
private commitUserTurnTask?: Task<void>;
|
|
138
153
|
private vadTask?: Task<void>;
|
|
139
154
|
private sttTask?: Task<void>;
|
|
155
|
+
private interruptionTask?: Task<void>;
|
|
156
|
+
|
|
157
|
+
// interruption detection
|
|
158
|
+
private interruptionDetection?: AdaptiveInterruptionDetector;
|
|
159
|
+
private _inputStartedAt?: number;
|
|
160
|
+
private ignoreUserTranscriptUntil?: number;
|
|
161
|
+
private transcriptBuffer: SpeechEvent[];
|
|
162
|
+
private isInterruptionEnabled: boolean;
|
|
163
|
+
private isAgentSpeaking: boolean;
|
|
164
|
+
private interruptionStreamChannel?: StreamChannel<InterruptionSentinel | AudioFrame>;
|
|
140
165
|
|
|
141
166
|
constructor(opts: AudioRecognitionOptions) {
|
|
142
167
|
this.hooks = opts.recognitionHooks;
|
|
@@ -153,9 +178,29 @@ export class AudioRecognition {
|
|
|
153
178
|
this.getLinkedParticipant = opts.getLinkedParticipant;
|
|
154
179
|
|
|
155
180
|
this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
|
|
156
|
-
|
|
157
|
-
this.
|
|
158
|
-
this.
|
|
181
|
+
this.interruptionDetection = opts.interruptionDetection;
|
|
182
|
+
this.transcriptBuffer = [];
|
|
183
|
+
this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad);
|
|
184
|
+
this.isAgentSpeaking = false;
|
|
185
|
+
|
|
186
|
+
if (opts.interruptionDetection) {
|
|
187
|
+
const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee();
|
|
188
|
+
const [inputStream, sttInputStream] = teedInput.tee();
|
|
189
|
+
this.vadInputStream = vadInputStream;
|
|
190
|
+
this.sttInputStream = mergeReadableStreams(
|
|
191
|
+
sttInputStream,
|
|
192
|
+
this.silenceAudioTransform.readable,
|
|
193
|
+
);
|
|
194
|
+
this.interruptionStreamChannel = createStreamChannel();
|
|
195
|
+
this.interruptionStreamChannel.addStreamInput(inputStream);
|
|
196
|
+
} else {
|
|
197
|
+
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
198
|
+
this.vadInputStream = vadInputStream;
|
|
199
|
+
this.sttInputStream = mergeReadableStreams(
|
|
200
|
+
sttInputStream,
|
|
201
|
+
this.silenceAudioTransform.readable,
|
|
202
|
+
);
|
|
203
|
+
}
|
|
159
204
|
this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
|
|
160
205
|
}
|
|
161
206
|
|
|
@@ -169,6 +214,16 @@ export class AudioRecognition {
|
|
|
169
214
|
return this.audioTranscript;
|
|
170
215
|
}
|
|
171
216
|
|
|
217
|
+
/**
 * Exposes the private `_inputStartedAt` field; `undefined` while it has not been set.
 *
 * @internal
 */
get inputStartedAt(): number | undefined {
  const startedAt = this._inputStartedAt;
  return startedAt;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Replaces the stored turn detection mode with `options.turnDetection`.
 *
 * @param options - carries the new mode; passing `undefined` clears the stored mode.
 * @internal
 */
updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void {
  const { turnDetection } = options;
  this.turnDetectionMode = turnDetection;
}
|
|
226
|
+
|
|
172
227
|
async start() {
|
|
173
228
|
this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));
|
|
174
229
|
this.vadTask.result.catch((err) => {
|
|
@@ -179,6 +234,211 @@ export class AudioRecognition {
|
|
|
179
234
|
this.sttTask.result.catch((err) => {
|
|
180
235
|
this.logger.error(`Error running STT task: ${err}`);
|
|
181
236
|
});
|
|
237
|
+
|
|
238
|
+
this.interruptionTask = Task.from(({ signal }) =>
|
|
239
|
+
this.createInterruptionTask(this.interruptionDetection, signal),
|
|
240
|
+
);
|
|
241
|
+
this.interruptionTask.result.catch((err) => {
|
|
242
|
+
this.logger.error(`Error running interruption task: ${err}`);
|
|
243
|
+
});
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
/**
 * Cancels the STT, VAD and interruption tasks, in that order, waiting for each
 * cancellation to settle before moving on to the next. A task that is not set
 * is skipped.
 */
async stop() {
  const cancel = (task?: Task<void>) => task?.cancelAndWait();

  await cancel(this.sttTask);
  await cancel(this.vadTask);
  await cancel(this.interruptionTask);
}
|
|
251
|
+
|
|
252
|
+
/**
 * Flags the agent as speaking, then forwards an `agentSpeechStarted` sentinel.
 *
 * @returns whatever `trySendInterruptionSentinel` resolves to for that sentinel.
 */
async onStartOfAgentSpeech() {
  this.isAgentSpeaking = true;

  const sentinel = InterruptionStreamSentinel.agentSpeechStarted();
  return this.trySendInterruptionSentinel(sentinel);
}
|
|
256
|
+
|
|
257
|
+
/**
 * Marks the end of agent speech and closes out any in-flight interruption inference.
 *
 * When interruption detection is disabled, or `trySendInterruptionSentinel` reports a
 * falsy result for the `agentSpeechEnded` sentinel, only the `isAgentSpeaking` flag is
 * cleared. Otherwise, if the agent was still flagged as speaking, the overlap inference
 * is ended (when no ignore bound is pending), the ignore bound is updated, and held
 * transcripts are flushed.
 *
 * @param ignoreUserTranscriptUntil - timestamp bound for held user transcripts; when a
 * bound is already pending, the smaller of the two is kept.
 */
async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) {
  if (!this.isInterruptionEnabled) {
    this.isAgentSpeaking = false;
    return;
  }

  const inputOpen = await this.trySendInterruptionSentinel(
    InterruptionStreamSentinel.agentSpeechEnded(),
  );
  if (!inputOpen) {
    this.isAgentSpeaking = false;
    return;
  }

  if (this.isAgentSpeaking) {
    if (this.ignoreUserTranscriptUntil === undefined) {
      // Awaited instead of left floating: a rejection now propagates to the caller
      // rather than becoming an unhandled rejection, and the send has settled before
      // held transcripts are flushed below.
      await this.onEndOfOverlapSpeech(Date.now());
    }

    // Compare against `undefined` explicitly (matching the check above) so that a
    // pending bound of `0` is not mistaken for "no bound".
    this.ignoreUserTranscriptUntil =
      this.ignoreUserTranscriptUntil !== undefined
        ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil)
        : ignoreUserTranscriptUntil;

    // flush held transcripts if possible
    await this.flushHeldTranscripts();
  }
  this.isAgentSpeaking = false;
}
|
|
284
|
+
|
|
285
|
+
/**
 * Start interruption inference when agent is speaking and overlap speech starts.
 *
 * Does nothing unless the agent is currently flagged as speaking.
 *
 * @param speechDuration - forwarded to the `overlapSpeechStarted` sentinel.
 * @param startedAt - forwarded to the `overlapSpeechStarted` sentinel.
 * @param userSpeakingSpan - optional span forwarded to the `overlapSpeechStarted` sentinel.
 */
async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) {
  if (!this.isAgentSpeaking) {
    return;
  }

  // Awaited instead of left floating, so a failed send surfaces through this method's
  // promise rather than as an unhandled rejection. The result is deliberately not
  // returned, keeping the method's `Promise<void>` shape.
  await this.trySendInterruptionSentinel(
    InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, startedAt, userSpeakingSpan),
  );
}
|
|
297
|
+
|
|
298
|
+
/**
 * End interruption inference when overlap speech ends.
 *
 * A no-op (resolving to `undefined`) while interruption detection is disabled.
 * If a span is supplied and still recording, it is tagged as not being an
 * interruption before the `overlapSpeechEnded` sentinel is forwarded.
 *
 * @param endedAt - forwarded to the `overlapSpeechEnded` sentinel.
 * @param userSpeakingSpan - optional span to annotate.
 * @returns whatever `trySendInterruptionSentinel` resolves to, or `undefined` when disabled.
 */
async onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span) {
  if (this.isInterruptionEnabled) {
    if (userSpeakingSpan?.isRecording()) {
      userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false');
    }

    const sentinel = InterruptionStreamSentinel.overlapSpeechEnded(endedAt);
    return this.trySendInterruptionSentinel(sentinel);
  }

  return undefined;
}
|
|
309
|
+
|
|
310
|
+
/**
 * Re-emit held transcripts, starting at the first one whose *end time* is not before
 * the `ignoreUserTranscriptUntil` window; everything else in the buffer is dropped.
 *
 * Events without alternatives carry no timestamps, so they are grouped with the next
 * event that does have them: they are re-emitted only if that event is kept.
 *
 * The buffer and the ignore window are always cleared once the initial guard passes,
 * including when nothing is re-emitted (input start unknown, an event with zeroed
 * timestamps, or no event outside the window).
 */
private async flushHeldTranscripts() {
  if (!this.isInterruptionEnabled) {
    return;
  }
  if (this.ignoreUserTranscriptUntil === undefined) {
    return;
  }
  if (this.transcriptBuffer.length === 0) {
    return;
  }

  // Shared reset used by every exit path below.
  const discardHeld = () => {
    this.transcriptBuffer = [];
    this.ignoreUserTranscriptUntil = undefined;
  };

  if (!this._inputStartedAt) {
    discardHeld();
    return;
  }

  let replayFrom: number | null = null;
  let foundKeeper = false;

  for (let idx = 0; idx < this.transcriptBuffer.length; idx++) {
    const held = this.transcriptBuffer[idx];

    if (!held || !held.alternatives || held.alternatives.length === 0) {
      // No timestamps available: tentatively attach to the next timestamped event.
      replayFrom = replayFrom ?? idx;
      continue;
    }

    const top = held.alternatives[0];
    if (top.startTime === 0 && top.endTime === 0) {
      // Zeroed timestamps: give up on the whole buffer.
      discardHeld();
      return;
    }

    if (!this.#alternativeEndsBeforeIgnoreWindow(top)) {
      replayFrom = replayFrom ?? idx;
      foundKeeper = true;
      break;
    }

    // This event is dropped, and so are any timestamp-less events queued before it.
    replayFrom = null;
  }

  const toReplay =
    foundKeeper && replayFrom !== null ? this.transcriptBuffer.slice(replayFrom) : [];

  discardHeld();

  for (const event of toReplay) {
    this.logger.trace(
      {
        event: event.type,
      },
      're-emitting held user transcript',
    );
    this.onSTTEvent(event);
  }
}
|
|
373
|
+
|
|
374
|
+
/**
 * Tell whether a transcript alternative finished before the ignore window expired.
 *
 * Returns `false` (never stale) when there is no ignore window, no recorded input start
 * time, or the alternative has no positive end time.
 *
 * @param alternative - the transcript alternative to test
 * @returns `true` when the alternative's end, converted to epoch milliseconds, lies
 *   strictly before `ignoreUserTranscriptUntil`
 */
#alternativeEndsBeforeIgnoreWindow(
  alternative: NonNullable<SpeechEvent['alternatives']>[number],
): boolean {
  if (
    this.ignoreUserTranscriptUntil === undefined ||
    !this._inputStartedAt ||
    alternative.endTime <= 0
  ) {
    return false;
  }

  // `SpeechData` times are in seconds relative to audio start, while `inputStartedAt` and
  // `ignoreUserTranscriptUntil` are epoch milliseconds. Compare the END of the alternative:
  // speech that began during the window but finished after it must not be classed as stale.
  const endedAtMs = alternative.endTime * 1000 + this._inputStartedAt;
  return endedAtMs < this.ignoreUserTranscriptUntil;
}
|
|
389
|
+
|
|
390
|
+
/**
 * Decide whether an STT event must be buffered instead of being processed right away.
 *
 * Side effect: a START_OF_SPEECH event arriving while the agent is not speaking clears
 * both the ignore window and the held-transcript buffer.
 *
 * @param ev - the incoming STT event
 * @returns `true` when the event should be held, `false` when it may be processed now
 */
private shouldHoldSttEvent(ev: SpeechEvent): boolean {
  if (!this.isInterruptionEnabled) return false;

  // Everything is held while the agent is talking.
  if (this.isAgentSpeaking) return true;

  if (ev.type === SpeechEventType.START_OF_SPEECH) {
    // The user starts a new utterance after the agent speech: start from a clean slate.
    this.ignoreUserTranscriptUntil = undefined;
    this.transcriptBuffer = [];
    return false;
  }

  if (this.ignoreUserTranscriptUntil === undefined) return false;

  // Events without alternatives stay held until a concrete transcript releases them.
  const alternatives = ev.alternatives;
  if (!alternatives || alternatives.length === 0) return true;

  const primary = alternatives[0];
  const hasDistinctTimes = primary.startTime !== primary.endTime;
  return hasDistinctTimes && this.#alternativeEndsBeforeIgnoreWindow(primary);
}
|
|
423
|
+
|
|
424
|
+
/**
 * Best-effort write of an audio frame or sentinel to the interruption stream channel.
 *
 * @param frame - the audio frame or sentinel to forward
 * @returns `true` when the write completed; `false` when interruption handling is
 *   disabled, the channel is missing or closed, or the write threw (logged as a warning)
 */
private async trySendInterruptionSentinel(
  frame: AudioFrame | InterruptionSentinel,
): Promise<boolean> {
  if (!this.isInterruptionEnabled) {
    return false;
  }
  if (!this.interruptionStreamChannel || this.interruptionStreamChannel.closed) {
    return false;
  }

  try {
    await this.interruptionStreamChannel.write(frame);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    this.logger.warn(`could not forward interruption sentinel: ${reason}`);
    return false;
  }
  return true;
}
|
|
183
443
|
|
|
184
444
|
private ensureUserTurnSpan(startTime?: number): Span {
|
|
@@ -234,6 +494,25 @@ export class AudioRecognition {
|
|
|
234
494
|
return;
|
|
235
495
|
}
|
|
236
496
|
|
|
497
|
+
// handle interruption detection
|
|
498
|
+
// - hold the event until the ignore_user_transcript_until expires
|
|
499
|
+
// - release only relevant events
|
|
500
|
+
// - allow RECOGNITION_USAGE to pass through immediately
|
|
501
|
+
|
|
502
|
+
if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) {
|
|
503
|
+
if (this.shouldHoldSttEvent(ev)) {
|
|
504
|
+
this.logger.trace(
|
|
505
|
+
{ event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil },
|
|
506
|
+
'holding STT event until ignore_user_transcript_until expires',
|
|
507
|
+
);
|
|
508
|
+
this.transcriptBuffer.push(ev);
|
|
509
|
+
return;
|
|
510
|
+
} else {
|
|
511
|
+
await this.flushHeldTranscripts();
|
|
512
|
+
// no return here to allow the new event to be processed normally
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
|
|
237
516
|
switch (ev.type) {
|
|
238
517
|
case SpeechEventType.FINAL_TRANSCRIPT:
|
|
239
518
|
const transcript = ev.alternatives?.[0]?.text;
|
|
@@ -417,6 +696,12 @@ export class AudioRecognition {
|
|
|
417
696
|
}
|
|
418
697
|
}
|
|
419
698
|
|
|
699
|
+
/**
 * Handle an event produced by the interruption stream: events flagged as an
 * interruption are passed to the `onInterruption` hook, all others are ignored.
 */
private onOverlapSpeechEvent(ev: OverlappingSpeechEvent) {
  if (!ev.isInterruption) return;
  this.hooks.onInterruption(ev);
}
|
|
704
|
+
|
|
420
705
|
private runEOUDetection(chatCtx: ChatContext) {
|
|
421
706
|
this.logger.debug(
|
|
422
707
|
{
|
|
@@ -675,7 +960,9 @@ export class AudioRecognition {
|
|
|
675
960
|
this.lastSpeakingTime = Date.now();
|
|
676
961
|
|
|
677
962
|
if (this.speechStartTime === undefined) {
|
|
678
|
-
|
|
963
|
+
// Backdate speechStartTime to the actual start of accumulated speech.
|
|
964
|
+
// ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS).
|
|
965
|
+
this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech;
|
|
679
966
|
}
|
|
680
967
|
}
|
|
681
968
|
break;
|
|
@@ -707,6 +994,85 @@ export class AudioRecognition {
|
|
|
707
994
|
}
|
|
708
995
|
}
|
|
709
996
|
|
|
997
|
+
/**
 * Run the interruption-detection pipeline until `signal` aborts or a stream ends.
 *
 * Two loops run concurrently:
 * - a forwarding loop that reads frames/sentinels from `interruptionStreamChannel` and
 *   pushes them into the detector stream, recording `_inputStartedAt` on the first item;
 * - an event loop that reads the detector's output and hands each event to
 *   `onOverlapSpeechEvent`.
 *
 * Failures in either loop that are not caused by an abort are emitted on the detector as
 * a non-recoverable `InterruptionDetectionError` and logged. Returns immediately when no
 * detector or no channel is available.
 *
 * @param interruptionDetection - detector to create the stream from, or `undefined`
 * @param signal - aborting it stops both loops and triggers cleanup
 */
private async createInterruptionTask(
  interruptionDetection: AdaptiveInterruptionDetector | undefined,
  signal: AbortSignal,
) {
  if (!interruptionDetection || !this.interruptionStreamChannel) return;

  // Captured in a const so the non-undefined narrowing holds inside the closures below.
  const detector = interruptionDetection;
  const stream = detector.createStream();
  const inputReader = this.interruptionStreamChannel.stream().getReader();

  // Report an unexpected failure; errors raised while shutting down on abort are ignored.
  const reportError = (e: unknown) => {
    if (signal.aborted) return;
    const cause = e instanceof Error ? e : new Error(String(e));
    detector.emitError(
      new InterruptionDetectionError(cause.message, Date.now(), detector.label, false),
    );
    this.logger.error(e, 'Error in interruption task');
  };

  // Runs both from the abort listener and from the final `finally`, so it may execute
  // twice; any error from the second pass is only logged.
  const cleanup = async () => {
    try {
      signal.removeEventListener('abort', abortHandler);
      eventReader.releaseLock();
      await stream.close();
    } catch (e) {
      // Error goes first, as in the other logger calls, so it is not lost as an unused
      // format argument.
      this.logger.debug({ error: e }, 'createInterruptionTask: error during cleanup');
    }
  };

  // Forward input frames/sentinels to the interruption stream.
  const forwardTask = (async () => {
    try {
      const abortPromise = waitForAbort(signal);
      while (!signal.aborted) {
        const res = await Promise.race([inputReader.read(), abortPromise]);
        if (!res) break;
        const { value, done } = res;
        if (done) break;
        // Backdate to the actual start of the audio frame, not when it was received.
        if (value instanceof AudioFrame) {
          const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000;
          this._inputStartedAt ??= Date.now() - frameDurationMs;
        } else {
          this._inputStartedAt ??= Date.now();
        }
        await stream.pushFrame(value);
      }
    } catch (e) {
      // Handle the failure here: this promise is only awaited in the final `finally`,
      // so a rejection would otherwise be unhandled until then and would later escape
      // from `finally` without being reported.
      reportError(e);
    } finally {
      inputReader.releaseLock();
    }
  })();

  // Read output events from the interruption stream.
  const eventReader = stream.stream().getReader();
  const abortHandler = async () => {
    await cleanup();
  };
  signal.addEventListener('abort', abortHandler);

  try {
    const abortPromise = waitForAbort(signal);

    while (!signal.aborted) {
      const res = await Promise.race([eventReader.read(), abortPromise]);
      if (!res) break;
      const { done, value: ev } = res;
      if (done) break;
      this.onOverlapSpeechEvent(ev);
    }
  } catch (e) {
    reportError(e);
  } finally {
    await cleanup();
    await forwardTask;
    this.logger.debug('Interruption task closed');
  }
}
|
|
1075
|
+
|
|
710
1076
|
setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
|
|
711
1077
|
this.deferredInputStream.setSource(audioStream);
|
|
712
1078
|
}
|
|
@@ -783,6 +1149,8 @@ export class AudioRecognition {
|
|
|
783
1149
|
await this.sttTask?.cancelAndWait();
|
|
784
1150
|
await this.vadTask?.cancelAndWait();
|
|
785
1151
|
await this.bounceEOUTask?.cancelAndWait();
|
|
1152
|
+
await this.interruptionTask?.cancelAndWait();
|
|
1153
|
+
await this.interruptionStreamChannel?.close();
|
|
786
1154
|
}
|
|
787
1155
|
|
|
788
1156
|
private _endUserTurnSpan({
|
|
@@ -809,6 +1177,14 @@ export class AudioRecognition {
|
|
|
809
1177
|
}
|
|
810
1178
|
|
|
811
1179
|
/**
 * Whether turn detection is VAD-based: `true` only when `turnDetectionMode` is unset or
 * the string `'vad'`; object-valued modes and every other value yield `false`.
 */
private get vadBaseTurnDetection() {
  const mode = this.turnDetectionMode;
  return mode === undefined || mode === 'vad';
}
|
|
814
1190
|
}
|