@livekit/agents 0.7.8 → 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_exceptions.cjs +109 -0
- package/dist/_exceptions.cjs.map +1 -0
- package/dist/_exceptions.d.cts +64 -0
- package/dist/_exceptions.d.ts +64 -0
- package/dist/_exceptions.d.ts.map +1 -0
- package/dist/_exceptions.js +80 -0
- package/dist/_exceptions.js.map +1 -0
- package/dist/audio.cjs +10 -3
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +2 -0
- package/dist/audio.d.ts +2 -0
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +8 -2
- package/dist/audio.js.map +1 -1
- package/dist/cli.cjs +25 -0
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +25 -0
- package/dist/cli.js.map +1 -1
- package/dist/constants.cjs +6 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +2 -0
- package/dist/constants.d.ts +2 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +4 -0
- package/dist/constants.js.map +1 -1
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +1 -0
- package/dist/http_server.d.ts +1 -0
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +27 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +13 -10
- package/dist/index.d.ts +13 -10
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +15 -11
- package/dist/index.js.map +1 -1
- package/dist/inference_runner.cjs +0 -1
- package/dist/inference_runner.cjs.map +1 -1
- package/dist/inference_runner.d.cts +2 -3
- package/dist/inference_runner.d.ts +2 -3
- package/dist/inference_runner.d.ts.map +1 -1
- package/dist/inference_runner.js +0 -1
- package/dist/inference_runner.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs +2 -2
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.js +2 -2
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/job_executor.cjs.map +1 -1
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +1 -0
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.js +1 -0
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +1 -1
- package/dist/ipc/supervised_proc.d.ts +1 -1
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/job.cjs +14 -2
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +8 -0
- package/dist/job.d.ts +8 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +12 -1
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.cjs +332 -82
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +152 -48
- package/dist/llm/chat_context.d.ts +152 -48
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +327 -81
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +380 -0
- package/dist/llm/chat_context.test.cjs.map +1 -0
- package/dist/llm/chat_context.test.js +385 -0
- package/dist/llm/chat_context.test.js.map +1 -0
- package/dist/llm/index.cjs +37 -8
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +7 -3
- package/dist/llm/index.d.ts +7 -3
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +39 -9
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +98 -33
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +50 -24
- package/dist/llm/llm.d.ts +50 -24
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +99 -33
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs +128 -0
- package/dist/llm/provider_format/google.cjs.map +1 -0
- package/dist/llm/provider_format/google.d.cts +6 -0
- package/dist/llm/provider_format/google.d.ts +6 -0
- package/dist/llm/provider_format/google.d.ts.map +1 -0
- package/dist/llm/provider_format/google.js +104 -0
- package/dist/llm/provider_format/google.js.map +1 -0
- package/dist/llm/provider_format/google.test.cjs +676 -0
- package/dist/llm/provider_format/google.test.cjs.map +1 -0
- package/dist/llm/provider_format/google.test.js +675 -0
- package/dist/llm/provider_format/google.test.js.map +1 -0
- package/dist/llm/provider_format/index.cjs +40 -0
- package/dist/llm/provider_format/index.cjs.map +1 -0
- package/dist/llm/provider_format/index.d.cts +4 -0
- package/dist/llm/provider_format/index.d.ts +4 -0
- package/dist/llm/provider_format/index.d.ts.map +1 -0
- package/dist/llm/provider_format/index.js +16 -0
- package/dist/llm/provider_format/index.js.map +1 -0
- package/dist/llm/provider_format/openai.cjs +116 -0
- package/dist/llm/provider_format/openai.cjs.map +1 -0
- package/dist/llm/provider_format/openai.d.cts +3 -0
- package/dist/llm/provider_format/openai.d.ts +3 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -0
- package/dist/llm/provider_format/openai.js +92 -0
- package/dist/llm/provider_format/openai.js.map +1 -0
- package/dist/llm/provider_format/openai.test.cjs +490 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -0
- package/dist/llm/provider_format/openai.test.js +489 -0
- package/dist/llm/provider_format/openai.test.js.map +1 -0
- package/dist/llm/provider_format/utils.cjs +146 -0
- package/dist/llm/provider_format/utils.cjs.map +1 -0
- package/dist/llm/provider_format/utils.d.cts +38 -0
- package/dist/llm/provider_format/utils.d.ts +38 -0
- package/dist/llm/provider_format/utils.d.ts.map +1 -0
- package/dist/llm/provider_format/utils.js +122 -0
- package/dist/llm/provider_format/utils.js.map +1 -0
- package/dist/llm/realtime.cjs +77 -0
- package/dist/llm/realtime.cjs.map +1 -0
- package/dist/llm/realtime.d.cts +98 -0
- package/dist/llm/realtime.d.ts +98 -0
- package/dist/llm/realtime.d.ts.map +1 -0
- package/dist/llm/realtime.js +52 -0
- package/dist/llm/realtime.js.map +1 -0
- package/dist/llm/remote_chat_context.cjs +112 -0
- package/dist/llm/remote_chat_context.cjs.map +1 -0
- package/dist/llm/remote_chat_context.d.cts +23 -0
- package/dist/llm/remote_chat_context.d.ts +23 -0
- package/dist/llm/remote_chat_context.d.ts.map +1 -0
- package/dist/llm/remote_chat_context.js +88 -0
- package/dist/llm/remote_chat_context.js.map +1 -0
- package/dist/llm/remote_chat_context.test.cjs +225 -0
- package/dist/llm/remote_chat_context.test.cjs.map +1 -0
- package/dist/llm/remote_chat_context.test.js +224 -0
- package/dist/llm/remote_chat_context.test.js.map +1 -0
- package/dist/llm/tool_context.cjs +111 -0
- package/dist/llm/tool_context.cjs.map +1 -0
- package/dist/llm/tool_context.d.cts +125 -0
- package/dist/llm/tool_context.d.ts +125 -0
- package/dist/llm/tool_context.d.ts.map +1 -0
- package/dist/llm/tool_context.js +80 -0
- package/dist/llm/tool_context.js.map +1 -0
- package/dist/llm/tool_context.test.cjs +162 -0
- package/dist/llm/tool_context.test.cjs.map +1 -0
- package/dist/llm/tool_context.test.js +161 -0
- package/dist/llm/tool_context.test.js.map +1 -0
- package/dist/llm/tool_context.type.test.cjs +92 -0
- package/dist/llm/tool_context.type.test.cjs.map +1 -0
- package/dist/llm/tool_context.type.test.js +91 -0
- package/dist/llm/tool_context.type.test.js.map +1 -0
- package/dist/llm/utils.cjs +260 -0
- package/dist/llm/utils.cjs.map +1 -0
- package/dist/llm/utils.d.cts +42 -0
- package/dist/llm/utils.d.ts +42 -0
- package/dist/llm/utils.d.ts.map +1 -0
- package/dist/llm/utils.js +223 -0
- package/dist/llm/utils.js.map +1 -0
- package/dist/llm/utils.test.cjs +513 -0
- package/dist/llm/utils.test.cjs.map +1 -0
- package/dist/llm/utils.test.js +490 -0
- package/dist/llm/utils.test.js.map +1 -0
- package/dist/metrics/base.cjs +0 -27
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +105 -63
- package/dist/metrics/base.d.ts +105 -63
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/base.js +0 -19
- package/dist/metrics/base.js.map +1 -1
- package/dist/metrics/index.cjs +0 -3
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +2 -3
- package/dist/metrics/index.d.ts +2 -3
- package/dist/metrics/index.d.ts.map +1 -1
- package/dist/metrics/index.js +0 -2
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/usage_collector.cjs +17 -12
- package/dist/metrics/usage_collector.cjs.map +1 -1
- package/dist/metrics/usage_collector.d.cts +3 -2
- package/dist/metrics/usage_collector.d.ts +3 -2
- package/dist/metrics/usage_collector.d.ts.map +1 -1
- package/dist/metrics/usage_collector.js +17 -12
- package/dist/metrics/usage_collector.js.map +1 -1
- package/dist/metrics/utils.cjs +22 -59
- package/dist/metrics/utils.cjs.map +1 -1
- package/dist/metrics/utils.d.cts +1 -8
- package/dist/metrics/utils.d.ts +1 -8
- package/dist/metrics/utils.d.ts.map +1 -1
- package/dist/metrics/utils.js +22 -52
- package/dist/metrics/utils.js.map +1 -1
- package/dist/multimodal/index.cjs +0 -2
- package/dist/multimodal/index.cjs.map +1 -1
- package/dist/multimodal/index.d.cts +0 -1
- package/dist/multimodal/index.d.ts +0 -1
- package/dist/multimodal/index.d.ts.map +1 -1
- package/dist/multimodal/index.js +0 -1
- package/dist/multimodal/index.js.map +1 -1
- package/dist/plugin.cjs +24 -8
- package/dist/plugin.cjs.map +1 -1
- package/dist/plugin.d.cts +18 -4
- package/dist/plugin.d.ts +18 -4
- package/dist/plugin.d.ts.map +1 -1
- package/dist/plugin.js +22 -7
- package/dist/plugin.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +98 -0
- package/dist/stream/deferred_stream.cjs.map +1 -0
- package/dist/stream/deferred_stream.d.cts +27 -0
- package/dist/stream/deferred_stream.d.ts +27 -0
- package/dist/stream/deferred_stream.d.ts.map +1 -0
- package/dist/stream/deferred_stream.js +73 -0
- package/dist/stream/deferred_stream.js.map +1 -0
- package/dist/stream/deferred_stream.test.cjs +527 -0
- package/dist/stream/deferred_stream.test.cjs.map +1 -0
- package/dist/stream/deferred_stream.test.js +526 -0
- package/dist/stream/deferred_stream.test.js.map +1 -0
- package/dist/stream/identity_transform.cjs +42 -0
- package/dist/stream/identity_transform.cjs.map +1 -0
- package/dist/stream/identity_transform.d.cts +6 -0
- package/dist/stream/identity_transform.d.ts +6 -0
- package/dist/stream/identity_transform.d.ts.map +1 -0
- package/dist/stream/identity_transform.js +18 -0
- package/dist/stream/identity_transform.js.map +1 -0
- package/dist/stream/identity_transform.test.cjs +125 -0
- package/dist/stream/identity_transform.test.cjs.map +1 -0
- package/dist/stream/identity_transform.test.js +124 -0
- package/dist/stream/identity_transform.test.js.map +1 -0
- package/dist/stream/index.cjs +38 -0
- package/dist/stream/index.cjs.map +1 -0
- package/dist/stream/index.d.cts +5 -0
- package/dist/stream/index.d.ts +5 -0
- package/dist/stream/index.d.ts.map +1 -0
- package/dist/stream/index.js +11 -0
- package/dist/stream/index.js.map +1 -0
- package/dist/stream/merge_readable_streams.cjs +59 -0
- package/dist/stream/merge_readable_streams.cjs.map +1 -0
- package/dist/stream/merge_readable_streams.d.cts +4 -0
- package/dist/stream/merge_readable_streams.d.ts +4 -0
- package/dist/stream/merge_readable_streams.d.ts.map +1 -0
- package/dist/stream/merge_readable_streams.js +35 -0
- package/dist/stream/merge_readable_streams.js.map +1 -0
- package/dist/stream/stream_channel.cjs +47 -0
- package/dist/stream/stream_channel.cjs.map +1 -0
- package/dist/stream/stream_channel.d.cts +9 -0
- package/dist/stream/stream_channel.d.ts +9 -0
- package/dist/stream/stream_channel.d.ts.map +1 -0
- package/dist/stream/stream_channel.js +23 -0
- package/dist/stream/stream_channel.js.map +1 -0
- package/dist/stream/stream_channel.test.cjs +97 -0
- package/dist/stream/stream_channel.test.cjs.map +1 -0
- package/dist/stream/stream_channel.test.js +96 -0
- package/dist/stream/stream_channel.test.js.map +1 -0
- package/dist/stt/stream_adapter.cjs +3 -4
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +1 -0
- package/dist/stt/stream_adapter.d.ts +1 -0
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +3 -4
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +101 -10
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +26 -5
- package/dist/stt/stt.d.ts +26 -5
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +102 -11
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +10 -5
- package/dist/tokenize/basic/basic.cjs.map +1 -1
- package/dist/tokenize/basic/basic.d.cts +7 -1
- package/dist/tokenize/basic/basic.d.ts +7 -1
- package/dist/tokenize/basic/basic.d.ts.map +1 -1
- package/dist/tokenize/basic/basic.js +10 -5
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/sentence.cjs +14 -6
- package/dist/tokenize/basic/sentence.cjs.map +1 -1
- package/dist/tokenize/basic/sentence.d.cts +1 -1
- package/dist/tokenize/basic/sentence.d.ts +1 -1
- package/dist/tokenize/basic/sentence.d.ts.map +1 -1
- package/dist/tokenize/basic/sentence.js +14 -6
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +5 -3
- package/dist/tokenize/token_stream.cjs.map +1 -1
- package/dist/tokenize/token_stream.d.cts +1 -0
- package/dist/tokenize/token_stream.d.ts +1 -0
- package/dist/tokenize/token_stream.d.ts.map +1 -1
- package/dist/tokenize/token_stream.js +6 -4
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/transcription.cjs +1 -2
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js +2 -3
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.cjs +2 -4
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +1 -3
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +26 -13
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +1 -1
- package/dist/tts/stream_adapter.d.ts +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +27 -14
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +157 -25
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +29 -5
- package/dist/tts/tts.d.ts +29 -5
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +157 -24
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +60 -0
- package/dist/types.cjs.map +1 -0
- package/dist/types.d.cts +13 -0
- package/dist/types.d.ts +13 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +35 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.cjs +281 -27
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +134 -9
- package/dist/utils.d.ts +134 -9
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +265 -26
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +492 -0
- package/dist/utils.test.cjs.map +1 -0
- package/dist/utils.test.js +498 -0
- package/dist/utils.test.js.map +1 -0
- package/dist/vad.cjs +76 -20
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +25 -5
- package/dist/vad.d.ts +25 -5
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +76 -20
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent.cjs +245 -0
- package/dist/voice/agent.cjs.map +1 -0
- package/dist/voice/agent.d.cts +78 -0
- package/dist/voice/agent.d.ts +78 -0
- package/dist/voice/agent.d.ts.map +1 -0
- package/dist/voice/agent.js +220 -0
- package/dist/voice/agent.js.map +1 -0
- package/dist/voice/agent.test.cjs +61 -0
- package/dist/voice/agent.test.cjs.map +1 -0
- package/dist/voice/agent.test.js +60 -0
- package/dist/voice/agent.test.js.map +1 -0
- package/dist/voice/agent_activity.cjs +1453 -0
- package/dist/voice/agent_activity.cjs.map +1 -0
- package/dist/voice/agent_activity.d.cts +94 -0
- package/dist/voice/agent_activity.d.ts +94 -0
- package/dist/voice/agent_activity.d.ts.map +1 -0
- package/dist/voice/agent_activity.js +1449 -0
- package/dist/voice/agent_activity.js.map +1 -0
- package/dist/voice/agent_session.cjs +312 -0
- package/dist/voice/agent_session.cjs.map +1 -0
- package/dist/voice/agent_session.d.cts +121 -0
- package/dist/voice/agent_session.d.ts +121 -0
- package/dist/voice/agent_session.d.ts.map +1 -0
- package/dist/voice/agent_session.js +295 -0
- package/dist/voice/agent_session.js.map +1 -0
- package/dist/voice/audio_recognition.cjs +375 -0
- package/dist/voice/audio_recognition.cjs.map +1 -0
- package/dist/voice/audio_recognition.d.cts +80 -0
- package/dist/voice/audio_recognition.d.ts +80 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -0
- package/dist/voice/audio_recognition.js +351 -0
- package/dist/voice/audio_recognition.js.map +1 -0
- package/dist/voice/events.cjs +145 -0
- package/dist/voice/events.cjs.map +1 -0
- package/dist/voice/events.d.cts +124 -0
- package/dist/voice/events.d.ts +124 -0
- package/dist/voice/events.d.ts.map +1 -0
- package/dist/voice/events.js +110 -0
- package/dist/voice/events.js.map +1 -0
- package/dist/voice/generation.cjs +700 -0
- package/dist/voice/generation.cjs.map +1 -0
- package/dist/voice/generation.d.cts +115 -0
- package/dist/voice/generation.d.ts +115 -0
- package/dist/voice/generation.d.ts.map +1 -0
- package/dist/voice/generation.js +672 -0
- package/dist/voice/generation.js.map +1 -0
- package/dist/voice/index.cjs +40 -0
- package/dist/voice/index.cjs.map +1 -0
- package/dist/voice/index.d.cts +5 -0
- package/dist/voice/index.d.ts +5 -0
- package/dist/voice/index.d.ts.map +1 -0
- package/dist/voice/index.js +11 -0
- package/dist/voice/index.js.map +1 -0
- package/dist/voice/io.cjs +245 -0
- package/dist/voice/io.cjs.map +1 -0
- package/dist/voice/io.d.cts +101 -0
- package/dist/voice/io.d.ts +101 -0
- package/dist/voice/io.d.ts.map +1 -0
- package/dist/voice/io.js +217 -0
- package/dist/voice/io.js.map +1 -0
- package/dist/voice/room_io/_input.cjs +121 -0
- package/dist/voice/room_io/_input.cjs.map +1 -0
- package/dist/voice/room_io/_input.d.cts +24 -0
- package/dist/voice/room_io/_input.d.ts +24 -0
- package/dist/voice/room_io/_input.d.ts.map +1 -0
- package/dist/voice/room_io/_input.js +102 -0
- package/dist/voice/room_io/_input.js.map +1 -0
- package/dist/voice/room_io/_output.cjs +358 -0
- package/dist/voice/room_io/_output.cjs.map +1 -0
- package/dist/voice/room_io/_output.d.cts +75 -0
- package/dist/voice/room_io/_output.d.ts +75 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -0
- package/dist/voice/room_io/_output.js +342 -0
- package/dist/voice/room_io/_output.js.map +1 -0
- package/dist/voice/room_io/index.cjs +25 -0
- package/dist/voice/room_io/index.cjs.map +1 -0
- package/dist/voice/room_io/index.d.cts +3 -0
- package/dist/voice/room_io/index.d.ts +3 -0
- package/dist/voice/room_io/index.d.ts.map +1 -0
- package/dist/voice/room_io/index.js +3 -0
- package/dist/voice/room_io/index.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +370 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -0
- package/dist/voice/room_io/room_io.d.cts +73 -0
- package/dist/voice/room_io/room_io.d.ts +73 -0
- package/dist/voice/room_io/room_io.d.ts.map +1 -0
- package/dist/voice/room_io/room_io.js +361 -0
- package/dist/voice/room_io/room_io.js.map +1 -0
- package/dist/{pipeline/index.cjs → voice/run_context.cjs} +16 -11
- package/dist/voice/run_context.cjs.map +1 -0
- package/dist/voice/run_context.d.cts +12 -0
- package/dist/voice/run_context.d.ts +12 -0
- package/dist/voice/run_context.d.ts.map +1 -0
- package/dist/voice/run_context.js +14 -0
- package/dist/voice/run_context.js.map +1 -0
- package/dist/voice/speech_handle.cjs +105 -0
- package/dist/voice/speech_handle.cjs.map +1 -0
- package/dist/voice/speech_handle.d.cts +46 -0
- package/dist/voice/speech_handle.d.ts +46 -0
- package/dist/voice/speech_handle.d.ts.map +1 -0
- package/dist/voice/speech_handle.js +81 -0
- package/dist/voice/speech_handle.js.map +1 -0
- package/dist/voice/transcription/_utils.cjs +45 -0
- package/dist/voice/transcription/_utils.cjs.map +1 -0
- package/dist/voice/transcription/_utils.d.cts +3 -0
- package/dist/voice/transcription/_utils.d.ts +3 -0
- package/dist/voice/transcription/_utils.d.ts.map +1 -0
- package/dist/voice/transcription/_utils.js +21 -0
- package/dist/voice/transcription/_utils.js.map +1 -0
- package/dist/voice/transcription/index.cjs +23 -0
- package/dist/voice/transcription/index.cjs.map +1 -0
- package/dist/voice/transcription/index.d.cts +2 -0
- package/dist/voice/transcription/index.d.ts +2 -0
- package/dist/voice/transcription/index.d.ts.map +1 -0
- package/dist/voice/transcription/index.js +2 -0
- package/dist/voice/transcription/index.js.map +1 -0
- package/dist/voice/transcription/synchronizer.cjs +380 -0
- package/dist/voice/transcription/synchronizer.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.d.cts +86 -0
- package/dist/voice/transcription/synchronizer.d.ts +86 -0
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -0
- package/dist/voice/transcription/synchronizer.js +355 -0
- package/dist/voice/transcription/synchronizer.js.map +1 -0
- package/dist/worker.cjs +22 -4
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +1 -1
- package/dist/worker.d.ts +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +22 -4
- package/dist/worker.js.map +1 -1
- package/package.json +9 -2
- package/src/_exceptions.ts +137 -0
- package/src/audio.ts +12 -1
- package/src/cli.ts +37 -0
- package/src/constants.ts +2 -0
- package/src/http_server.ts +1 -0
- package/src/index.ts +13 -10
- package/src/inference_runner.ts +2 -3
- package/src/ipc/inference_proc_executor.ts +2 -2
- package/src/ipc/job_executor.ts +1 -1
- package/src/ipc/job_proc_executor.ts +1 -1
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/job.ts +18 -0
- package/src/llm/__snapshots__/chat_context.test.ts.snap +527 -0
- package/src/llm/__snapshots__/tool_context.test.ts.snap +177 -0
- package/src/llm/__snapshots__/utils.test.ts.snap +65 -0
- package/src/llm/chat_context.test.ts +450 -0
- package/src/llm/chat_context.ts +501 -103
- package/src/llm/index.ts +53 -18
- package/src/llm/llm.ts +149 -50
- package/src/llm/provider_format/google.test.ts +772 -0
- package/src/llm/provider_format/google.ts +130 -0
- package/src/llm/provider_format/index.ts +23 -0
- package/src/llm/provider_format/openai.test.ts +581 -0
- package/src/llm/provider_format/openai.ts +118 -0
- package/src/llm/provider_format/utils.ts +183 -0
- package/src/llm/realtime.ts +151 -0
- package/src/llm/remote_chat_context.test.ts +290 -0
- package/src/llm/remote_chat_context.ts +114 -0
- package/src/llm/tool_context.test.ts +198 -0
- package/src/llm/tool_context.ts +259 -0
- package/src/llm/tool_context.type.test.ts +115 -0
- package/src/llm/utils.test.ts +670 -0
- package/src/llm/utils.ts +324 -0
- package/src/metrics/base.ts +110 -78
- package/src/metrics/index.ts +3 -9
- package/src/metrics/usage_collector.ts +19 -13
- package/src/metrics/utils.ts +24 -69
- package/src/multimodal/index.ts +0 -1
- package/src/plugin.ts +26 -8
- package/src/stream/deferred_stream.test.ts +755 -0
- package/src/stream/deferred_stream.ts +110 -0
- package/src/stream/identity_transform.test.ts +179 -0
- package/src/stream/identity_transform.ts +18 -0
- package/src/stream/index.ts +7 -0
- package/src/stream/merge_readable_streams.ts +40 -0
- package/src/stream/stream_channel.test.ts +129 -0
- package/src/stream/stream_channel.ts +32 -0
- package/src/stt/stream_adapter.ts +3 -5
- package/src/stt/stt.ts +135 -17
- package/src/tokenize/basic/basic.ts +13 -5
- package/src/tokenize/basic/sentence.ts +20 -6
- package/src/tokenize/token_stream.ts +7 -4
- package/src/transcription.ts +2 -3
- package/src/tts/index.ts +0 -1
- package/src/tts/stream_adapter.ts +42 -16
- package/src/tts/tts.ts +203 -21
- package/src/types.ts +42 -0
- package/src/utils.test.ts +658 -0
- package/src/utils.ts +375 -44
- package/src/vad.ts +90 -22
- package/src/voice/agent.test.ts +80 -0
- package/src/voice/agent.ts +332 -0
- package/src/voice/agent_activity.ts +1913 -0
- package/src/voice/agent_session.ts +460 -0
- package/src/voice/audio_recognition.ts +474 -0
- package/src/voice/events.ts +252 -0
- package/src/voice/generation.ts +881 -0
- package/src/voice/index.ts +7 -0
- package/src/voice/io.ts +304 -0
- package/src/voice/room_io/_input.ts +144 -0
- package/src/voice/room_io/_output.ts +436 -0
- package/src/voice/room_io/index.ts +5 -0
- package/src/voice/room_io/room_io.ts +495 -0
- package/src/voice/run_context.ts +20 -0
- package/src/voice/speech_handle.ts +104 -0
- package/src/voice/transcription/_utils.ts +25 -0
- package/src/voice/transcription/index.ts +4 -0
- package/src/voice/transcription/synchronizer.ts +478 -0
- package/src/worker.ts +22 -2
- package/dist/llm/function_context.cjs +0 -103
- package/dist/llm/function_context.cjs.map +0 -1
- package/dist/llm/function_context.d.cts +0 -47
- package/dist/llm/function_context.d.ts +0 -47
- package/dist/llm/function_context.d.ts.map +0 -1
- package/dist/llm/function_context.js +0 -78
- package/dist/llm/function_context.js.map +0 -1
- package/dist/llm/function_context.test.cjs +0 -218
- package/dist/llm/function_context.test.cjs.map +0 -1
- package/dist/llm/function_context.test.js +0 -217
- package/dist/llm/function_context.test.js.map +0 -1
- package/dist/multimodal/multimodal_agent.cjs +0 -451
- package/dist/multimodal/multimodal_agent.cjs.map +0 -1
- package/dist/multimodal/multimodal_agent.d.cts +0 -48
- package/dist/multimodal/multimodal_agent.d.ts +0 -48
- package/dist/multimodal/multimodal_agent.d.ts.map +0 -1
- package/dist/multimodal/multimodal_agent.js +0 -425
- package/dist/multimodal/multimodal_agent.js.map +0 -1
- package/dist/pipeline/agent_output.cjs +0 -197
- package/dist/pipeline/agent_output.cjs.map +0 -1
- package/dist/pipeline/agent_output.d.cts +0 -33
- package/dist/pipeline/agent_output.d.ts +0 -33
- package/dist/pipeline/agent_output.d.ts.map +0 -1
- package/dist/pipeline/agent_output.js +0 -172
- package/dist/pipeline/agent_output.js.map +0 -1
- package/dist/pipeline/agent_playout.cjs +0 -175
- package/dist/pipeline/agent_playout.cjs.map +0 -1
- package/dist/pipeline/agent_playout.d.cts +0 -40
- package/dist/pipeline/agent_playout.d.ts +0 -40
- package/dist/pipeline/agent_playout.d.ts.map +0 -1
- package/dist/pipeline/agent_playout.js +0 -139
- package/dist/pipeline/agent_playout.js.map +0 -1
- package/dist/pipeline/human_input.cjs +0 -171
- package/dist/pipeline/human_input.cjs.map +0 -1
- package/dist/pipeline/human_input.d.cts +0 -30
- package/dist/pipeline/human_input.d.ts +0 -30
- package/dist/pipeline/human_input.d.ts.map +0 -1
- package/dist/pipeline/human_input.js +0 -146
- package/dist/pipeline/human_input.js.map +0 -1
- package/dist/pipeline/index.cjs.map +0 -1
- package/dist/pipeline/index.d.cts +0 -2
- package/dist/pipeline/index.d.ts +0 -2
- package/dist/pipeline/index.d.ts.map +0 -1
- package/dist/pipeline/index.js +0 -11
- package/dist/pipeline/index.js.map +0 -1
- package/dist/pipeline/pipeline_agent.cjs +0 -849
- package/dist/pipeline/pipeline_agent.cjs.map +0 -1
- package/dist/pipeline/pipeline_agent.d.cts +0 -150
- package/dist/pipeline/pipeline_agent.d.ts +0 -150
- package/dist/pipeline/pipeline_agent.d.ts.map +0 -1
- package/dist/pipeline/pipeline_agent.js +0 -826
- package/dist/pipeline/pipeline_agent.js.map +0 -1
- package/dist/pipeline/speech_handle.cjs +0 -176
- package/dist/pipeline/speech_handle.cjs.map +0 -1
- package/dist/pipeline/speech_handle.d.cts +0 -37
- package/dist/pipeline/speech_handle.d.ts +0 -37
- package/dist/pipeline/speech_handle.d.ts.map +0 -1
- package/dist/pipeline/speech_handle.js +0 -152
- package/dist/pipeline/speech_handle.js.map +0 -1
- package/src/llm/function_context.test.ts +0 -248
- package/src/llm/function_context.ts +0 -142
- package/src/multimodal/multimodal_agent.ts +0 -555
- package/src/pipeline/agent_output.ts +0 -219
- package/src/pipeline/agent_playout.ts +0 -192
- package/src/pipeline/human_input.ts +0 -188
- package/src/pipeline/index.ts +0 -15
- package/src/pipeline/pipeline_agent.ts +0 -1185
- package/src/pipeline/speech_handle.ts +0 -201
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var audio_recognition_exports = {};
|
|
20
|
+
__export(audio_recognition_exports, {
|
|
21
|
+
AudioRecognition: () => AudioRecognition
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(audio_recognition_exports);
|
|
24
|
+
var import_rtc_node = require("@livekit/rtc-node");
|
|
25
|
+
var import_async = require("@std/async");
|
|
26
|
+
var import_web = require("node:stream/web");
|
|
27
|
+
var import_chat_context = require("../llm/chat_context.cjs");
|
|
28
|
+
var import_log = require("../log.cjs");
|
|
29
|
+
var import_deferred_stream = require("../stream/deferred_stream.cjs");
|
|
30
|
+
var import_identity_transform = require("../stream/identity_transform.cjs");
|
|
31
|
+
var import_merge_readable_streams = require("../stream/merge_readable_streams.cjs");
|
|
32
|
+
var import_stt = require("../stt/stt.cjs");
|
|
33
|
+
var import_utils = require("../utils.cjs");
|
|
34
|
+
var import_vad = require("../vad.cjs");
|
|
35
|
+
class AudioRecognition {
|
|
36
|
+
hooks;
|
|
37
|
+
stt;
|
|
38
|
+
vad;
|
|
39
|
+
turnDetector;
|
|
40
|
+
turnDetectionMode;
|
|
41
|
+
minEndpointingDelay;
|
|
42
|
+
maxEndpointingDelay;
|
|
43
|
+
lastLanguage;
|
|
44
|
+
deferredInputStream;
|
|
45
|
+
logger = (0, import_log.log)();
|
|
46
|
+
lastFinalTranscriptTime = 0;
|
|
47
|
+
audioTranscript = "";
|
|
48
|
+
audioInterimTranscript = "";
|
|
49
|
+
lastSpeakingTime = 0;
|
|
50
|
+
userTurnCommitted = false;
|
|
51
|
+
speaking = false;
|
|
52
|
+
sampleRate;
|
|
53
|
+
vadInputStream;
|
|
54
|
+
sttInputStream;
|
|
55
|
+
silenceAudioTransform = new import_identity_transform.IdentityTransform();
|
|
56
|
+
silenceAudioWriter;
|
|
57
|
+
// all cancellable tasks
|
|
58
|
+
bounceEOUTask;
|
|
59
|
+
commitUserTurnTask;
|
|
60
|
+
vadTask;
|
|
61
|
+
sttTask;
|
|
62
|
+
constructor(opts) {
|
|
63
|
+
this.hooks = opts.recognitionHooks;
|
|
64
|
+
this.stt = opts.stt;
|
|
65
|
+
this.vad = opts.vad;
|
|
66
|
+
this.turnDetector = opts.turnDetector;
|
|
67
|
+
this.turnDetectionMode = opts.turnDetectionMode;
|
|
68
|
+
this.minEndpointingDelay = opts.minEndpointingDelay;
|
|
69
|
+
this.maxEndpointingDelay = opts.maxEndpointingDelay;
|
|
70
|
+
this.lastLanguage = void 0;
|
|
71
|
+
this.deferredInputStream = new import_deferred_stream.DeferredReadableStream();
|
|
72
|
+
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
73
|
+
this.vadInputStream = vadInputStream;
|
|
74
|
+
this.sttInputStream = (0, import_merge_readable_streams.mergeReadableStreams)(sttInputStream, this.silenceAudioTransform.readable);
|
|
75
|
+
this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Current transcript of the user's speech, including interim transcript if available.
|
|
79
|
+
*/
|
|
80
|
+
get currentTranscript() {
|
|
81
|
+
if (this.audioInterimTranscript) {
|
|
82
|
+
return `${this.audioTranscript} ${this.audioInterimTranscript}`.trim();
|
|
83
|
+
}
|
|
84
|
+
return this.audioTranscript;
|
|
85
|
+
}
|
|
86
|
+
async start() {
|
|
87
|
+
this.vadTask = import_utils.Task.from(({ signal }) => this.createVadTask(this.vad, signal));
|
|
88
|
+
this.vadTask.result.catch((err) => {
|
|
89
|
+
this.logger.error(`Error running VAD task: ${err}`);
|
|
90
|
+
});
|
|
91
|
+
this.sttTask = import_utils.Task.from(({ signal }) => this.createSttTask(this.stt, signal));
|
|
92
|
+
this.sttTask.result.catch((err) => {
|
|
93
|
+
this.logger.error(`Error running STT task: ${err}`);
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
async onSTTEvent(ev) {
|
|
97
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _i;
|
|
98
|
+
if (this.turnDetectionMode === "manual" && this.userTurnCommitted && (this.bounceEOUTask === void 0 || this.bounceEOUTask.done || ev.type == import_stt.SpeechEventType.INTERIM_TRANSCRIPT)) {
|
|
99
|
+
this.logger.debug(
|
|
100
|
+
{
|
|
101
|
+
userTurnCommitted: this.userTurnCommitted,
|
|
102
|
+
eouTaskDone: (_a = this.bounceEOUTask) == null ? void 0 : _a.done,
|
|
103
|
+
evType: ev.type,
|
|
104
|
+
turnDetectionMode: this.turnDetectionMode
|
|
105
|
+
},
|
|
106
|
+
"ignoring stt event"
|
|
107
|
+
);
|
|
108
|
+
return;
|
|
109
|
+
}
|
|
110
|
+
switch (ev.type) {
|
|
111
|
+
case import_stt.SpeechEventType.FINAL_TRANSCRIPT:
|
|
112
|
+
this.hooks.onFinalTranscript(ev);
|
|
113
|
+
const transcript = (_c = (_b = ev.alternatives) == null ? void 0 : _b[0]) == null ? void 0 : _c.text;
|
|
114
|
+
this.lastLanguage = (_e = (_d = ev.alternatives) == null ? void 0 : _d[0]) == null ? void 0 : _e.language;
|
|
115
|
+
if (!transcript) {
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
118
|
+
this.logger.debug(
|
|
119
|
+
{
|
|
120
|
+
user_transcript: transcript,
|
|
121
|
+
language: this.lastLanguage
|
|
122
|
+
},
|
|
123
|
+
"received user transcript"
|
|
124
|
+
);
|
|
125
|
+
this.lastFinalTranscriptTime = Date.now();
|
|
126
|
+
this.audioTranscript += ` ${transcript}`;
|
|
127
|
+
this.audioTranscript = this.audioTranscript.trimStart();
|
|
128
|
+
this.audioInterimTranscript = "";
|
|
129
|
+
if (!this.speaking) {
|
|
130
|
+
if (!this.vad) {
|
|
131
|
+
this.lastSpeakingTime = Date.now();
|
|
132
|
+
}
|
|
133
|
+
if (this.vadBaseTurnDetection || this.userTurnCommitted) {
|
|
134
|
+
const chatCtx = this.hooks.retrieveChatCtx();
|
|
135
|
+
this.logger.debug("running EOU detection on stt FINAL_TRANSCRIPT");
|
|
136
|
+
this.runEOUDetection(chatCtx);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
break;
|
|
140
|
+
case import_stt.SpeechEventType.INTERIM_TRANSCRIPT:
|
|
141
|
+
this.logger.debug({ transcript: (_g = (_f = ev.alternatives) == null ? void 0 : _f[0]) == null ? void 0 : _g.text }, "interim transcript");
|
|
142
|
+
this.hooks.onInterimTranscript(ev);
|
|
143
|
+
this.audioInterimTranscript = ((_i = (_h = ev.alternatives) == null ? void 0 : _h[0]) == null ? void 0 : _i.text) ?? "";
|
|
144
|
+
break;
|
|
145
|
+
case import_stt.SpeechEventType.END_OF_SPEECH:
|
|
146
|
+
if (this.turnDetectionMode !== "stt") break;
|
|
147
|
+
this.userTurnCommitted = true;
|
|
148
|
+
if (!this.speaking) {
|
|
149
|
+
const chatCtx = this.hooks.retrieveChatCtx();
|
|
150
|
+
this.logger.debug("running EOU detection on stt END_OF_SPEECH");
|
|
151
|
+
this.runEOUDetection(chatCtx);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
runEOUDetection(chatCtx) {
|
|
156
|
+
var _a;
|
|
157
|
+
this.logger.debug(
|
|
158
|
+
{
|
|
159
|
+
stt: this.stt,
|
|
160
|
+
audioTranscript: this.audioTranscript,
|
|
161
|
+
turnDetectionMode: this.turnDetectionMode
|
|
162
|
+
},
|
|
163
|
+
"running EOU detection"
|
|
164
|
+
);
|
|
165
|
+
if (this.stt && !this.audioTranscript && this.turnDetectionMode !== "manual") {
|
|
166
|
+
this.logger.debug("skipping EOU detection");
|
|
167
|
+
return;
|
|
168
|
+
}
|
|
169
|
+
chatCtx = chatCtx.copy();
|
|
170
|
+
chatCtx.addMessage({ role: "user", content: this.audioTranscript });
|
|
171
|
+
const turnDetector = (
|
|
172
|
+
// disable EOU model if manual turn detection enabled
|
|
173
|
+
this.audioTranscript && this.turnDetectionMode !== "manual" ? this.turnDetector : void 0
|
|
174
|
+
);
|
|
175
|
+
const bounceEOUTask = (lastSpeakingTime) => async (controller) => {
|
|
176
|
+
let endpointingDelay = this.minEndpointingDelay;
|
|
177
|
+
if (turnDetector) {
|
|
178
|
+
this.logger.debug("Running turn detector model");
|
|
179
|
+
if (!turnDetector.supportsLanguage(this.lastLanguage)) {
|
|
180
|
+
this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
|
|
181
|
+
} else {
|
|
182
|
+
const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
183
|
+
this.logger.debug(
|
|
184
|
+
{ endOfTurnProbability, language: this.lastLanguage },
|
|
185
|
+
"end of turn probability"
|
|
186
|
+
);
|
|
187
|
+
const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
188
|
+
this.logger.debug(
|
|
189
|
+
{
|
|
190
|
+
unlikelyThreshold,
|
|
191
|
+
endOfTurnProbability,
|
|
192
|
+
language: this.lastLanguage,
|
|
193
|
+
transcript: this.audioTranscript
|
|
194
|
+
},
|
|
195
|
+
"EOU Detection"
|
|
196
|
+
);
|
|
197
|
+
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
198
|
+
endpointingDelay = this.maxEndpointingDelay;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
const extraSleep = lastSpeakingTime + endpointingDelay - Date.now();
|
|
203
|
+
await (0, import_async.delay)(Math.max(extraSleep, 0), { signal: controller.signal });
|
|
204
|
+
this.logger.debug({ transcript: this.audioTranscript }, "end of user turn");
|
|
205
|
+
const committed = await this.hooks.onEndOfTurn({
|
|
206
|
+
newTranscript: this.audioTranscript,
|
|
207
|
+
transcriptionDelay: Math.max(this.lastFinalTranscriptTime - lastSpeakingTime, 0),
|
|
208
|
+
endOfUtteranceDelay: Date.now() - lastSpeakingTime
|
|
209
|
+
});
|
|
210
|
+
if (committed) {
|
|
211
|
+
this.audioTranscript = "";
|
|
212
|
+
}
|
|
213
|
+
this.userTurnCommitted = false;
|
|
214
|
+
};
|
|
215
|
+
(_a = this.bounceEOUTask) == null ? void 0 : _a.cancel();
|
|
216
|
+
this.bounceEOUTask = import_utils.Task.from(bounceEOUTask(this.lastSpeakingTime));
|
|
217
|
+
this.bounceEOUTask.result.then(() => {
|
|
218
|
+
this.logger.debug("EOU detection task completed");
|
|
219
|
+
}).catch((err) => {
|
|
220
|
+
if (err instanceof Error && err.message.includes("This operation was aborted")) {
|
|
221
|
+
return;
|
|
222
|
+
}
|
|
223
|
+
this.logger.error(err, "Error in EOU detection task:");
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
async createSttTask(stt, signal) {
|
|
227
|
+
if (!stt) return;
|
|
228
|
+
this.logger.debug("createSttTask: create stt stream from stt node");
|
|
229
|
+
const sttStream = await stt(this.sttInputStream, {});
|
|
230
|
+
if (signal.aborted || sttStream === null) return;
|
|
231
|
+
if (sttStream instanceof import_web.ReadableStream) {
|
|
232
|
+
const reader = sttStream.getReader();
|
|
233
|
+
signal.addEventListener("abort", async () => {
|
|
234
|
+
try {
|
|
235
|
+
reader.releaseLock();
|
|
236
|
+
await (sttStream == null ? void 0 : sttStream.cancel());
|
|
237
|
+
} catch (e) {
|
|
238
|
+
this.logger.debug("createSttTask: error during abort handler:", e);
|
|
239
|
+
}
|
|
240
|
+
});
|
|
241
|
+
try {
|
|
242
|
+
while (true) {
|
|
243
|
+
if (signal.aborted) break;
|
|
244
|
+
const { done, value: ev } = await reader.read();
|
|
245
|
+
if (done) break;
|
|
246
|
+
if (typeof ev === "string") {
|
|
247
|
+
throw new Error("STT node must yield SpeechEvent");
|
|
248
|
+
} else {
|
|
249
|
+
await this.onSTTEvent(ev);
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
} catch (e) {
|
|
253
|
+
if ((0, import_deferred_stream.isStreamReaderReleaseError)(e)) {
|
|
254
|
+
return;
|
|
255
|
+
}
|
|
256
|
+
this.logger.error({ error: e }, "createSttTask: error reading sttStream");
|
|
257
|
+
} finally {
|
|
258
|
+
reader.releaseLock();
|
|
259
|
+
try {
|
|
260
|
+
await sttStream.cancel();
|
|
261
|
+
} catch (e) {
|
|
262
|
+
this.logger.debug(
|
|
263
|
+
"createSttTask: error cancelling sttStream (may already be cancelled):",
|
|
264
|
+
e
|
|
265
|
+
);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
async createVadTask(vad, signal) {
|
|
271
|
+
var _a;
|
|
272
|
+
if (!vad) return;
|
|
273
|
+
const vadStream = vad.stream();
|
|
274
|
+
vadStream.updateInputStream(this.vadInputStream);
|
|
275
|
+
const abortHandler = () => {
|
|
276
|
+
vadStream.detachInputStream();
|
|
277
|
+
vadStream.close();
|
|
278
|
+
signal.removeEventListener("abort", abortHandler);
|
|
279
|
+
};
|
|
280
|
+
signal.addEventListener("abort", abortHandler);
|
|
281
|
+
try {
|
|
282
|
+
for await (const ev of vadStream) {
|
|
283
|
+
if (signal.aborted) break;
|
|
284
|
+
switch (ev.type) {
|
|
285
|
+
case import_vad.VADEventType.START_OF_SPEECH:
|
|
286
|
+
this.logger.debug("VAD task: START_OF_SPEECH");
|
|
287
|
+
this.hooks.onStartOfSpeech(ev);
|
|
288
|
+
this.speaking = true;
|
|
289
|
+
(_a = this.bounceEOUTask) == null ? void 0 : _a.cancel();
|
|
290
|
+
break;
|
|
291
|
+
case import_vad.VADEventType.INFERENCE_DONE:
|
|
292
|
+
this.hooks.onVADInferenceDone(ev);
|
|
293
|
+
break;
|
|
294
|
+
case import_vad.VADEventType.END_OF_SPEECH:
|
|
295
|
+
this.logger.debug("VAD task: END_OF_SPEECH");
|
|
296
|
+
this.hooks.onEndOfSpeech(ev);
|
|
297
|
+
this.speaking = false;
|
|
298
|
+
this.lastSpeakingTime = Date.now() - ev.silenceDuration;
|
|
299
|
+
if (this.vadBaseTurnDetection || this.turnDetectionMode === "stt" && this.userTurnCommitted) {
|
|
300
|
+
const chatCtx = this.hooks.retrieveChatCtx();
|
|
301
|
+
this.runEOUDetection(chatCtx);
|
|
302
|
+
}
|
|
303
|
+
break;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
} catch (e) {
|
|
307
|
+
this.logger.error(e, "Error in VAD task");
|
|
308
|
+
} finally {
|
|
309
|
+
this.logger.debug("VAD task closed");
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
setInputAudioStream(audioStream) {
|
|
313
|
+
this.deferredInputStream.setSource(audioStream);
|
|
314
|
+
}
|
|
315
|
+
detachInputAudioStream() {
|
|
316
|
+
this.deferredInputStream.detachSource();
|
|
317
|
+
}
|
|
318
|
+
clearUserTurn() {
|
|
319
|
+
var _a;
|
|
320
|
+
this.audioTranscript = "";
|
|
321
|
+
this.audioInterimTranscript = "";
|
|
322
|
+
this.userTurnCommitted = false;
|
|
323
|
+
(_a = this.sttTask) == null ? void 0 : _a.cancelAndWait().finally(() => {
|
|
324
|
+
this.sttTask = import_utils.Task.from(({ signal }) => this.createSttTask(this.stt, signal));
|
|
325
|
+
this.sttTask.result.catch((err) => {
|
|
326
|
+
this.logger.error(`Error running STT task: ${err}`);
|
|
327
|
+
});
|
|
328
|
+
});
|
|
329
|
+
}
|
|
330
|
+
commitUserTurn(audioDetached) {
|
|
331
|
+
var _a;
|
|
332
|
+
const commitUserTurnTask = (delayDuration = 500) => async (controller) => {
|
|
333
|
+
if (Date.now() - this.lastFinalTranscriptTime > delayDuration) {
|
|
334
|
+
if (audioDetached && this.sampleRate !== void 0) {
|
|
335
|
+
const numSamples = Math.floor(this.sampleRate * 0.5);
|
|
336
|
+
const silence = new Int16Array(numSamples * 2);
|
|
337
|
+
const silenceFrame = new import_rtc_node.AudioFrame(silence, this.sampleRate, 1, numSamples);
|
|
338
|
+
this.silenceAudioWriter.write(silenceFrame);
|
|
339
|
+
}
|
|
340
|
+
await (0, import_async.delay)(delayDuration, { signal: controller.signal });
|
|
341
|
+
}
|
|
342
|
+
if (this.audioInterimTranscript) {
|
|
343
|
+
this.audioTranscript = `${this.audioTranscript} ${this.audioInterimTranscript}`.trim();
|
|
344
|
+
}
|
|
345
|
+
this.audioInterimTranscript = "";
|
|
346
|
+
const chatCtx = this.hooks.retrieveChatCtx();
|
|
347
|
+
this.logger.debug("running EOU detection on commitUserTurn");
|
|
348
|
+
this.runEOUDetection(chatCtx);
|
|
349
|
+
this.userTurnCommitted = true;
|
|
350
|
+
};
|
|
351
|
+
(_a = this.commitUserTurnTask) == null ? void 0 : _a.cancel();
|
|
352
|
+
this.commitUserTurnTask = import_utils.Task.from(commitUserTurnTask());
|
|
353
|
+
this.commitUserTurnTask.result.then(() => {
|
|
354
|
+
this.logger.debug("User turn committed");
|
|
355
|
+
}).catch((err) => {
|
|
356
|
+
this.logger.error(err, "Error in user turn commit task:");
|
|
357
|
+
});
|
|
358
|
+
}
|
|
359
|
+
async close() {
|
|
360
|
+
var _a, _b, _c, _d;
|
|
361
|
+
this.detachInputAudioStream();
|
|
362
|
+
await ((_a = this.commitUserTurnTask) == null ? void 0 : _a.cancelAndWait());
|
|
363
|
+
await ((_b = this.sttTask) == null ? void 0 : _b.cancelAndWait());
|
|
364
|
+
await ((_c = this.vadTask) == null ? void 0 : _c.cancelAndWait());
|
|
365
|
+
await ((_d = this.bounceEOUTask) == null ? void 0 : _d.cancelAndWait());
|
|
366
|
+
}
|
|
367
|
+
get vadBaseTurnDetection() {
|
|
368
|
+
return ["vad", void 0].includes(this.turnDetectionMode);
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
372
|
+
0 && (module.exports = {
|
|
373
|
+
AudioRecognition
|
|
374
|
+
});
|
|
375
|
+
//# sourceMappingURL=audio_recognition.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/voice/audio_recognition.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2025 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AudioFrame } from '@livekit/rtc-node';\nimport { delay } from '@std/async';\nimport type { WritableStreamDefaultWriter } from 'node:stream/web';\nimport { ReadableStream } from 'node:stream/web';\nimport { type ChatContext } from '../llm/chat_context.js';\nimport { log } from '../log.js';\nimport { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';\nimport { IdentityTransform } from '../stream/identity_transform.js';\nimport { mergeReadableStreams } from '../stream/merge_readable_streams.js';\nimport { type SpeechEvent, SpeechEventType } from '../stt/stt.js';\nimport { Task } from '../utils.js';\nimport { type VAD, type VADEvent, VADEventType } from '../vad.js';\nimport type { TurnDetectionMode } from './agent_session.js';\nimport type { STTNode } from './io.js';\n\nexport interface EndOfTurnInfo {\n newTranscript: string;\n transcriptionDelay: number;\n endOfUtteranceDelay: number;\n}\n\nexport interface RecognitionHooks {\n onStartOfSpeech: (ev: VADEvent) => void;\n onVADInferenceDone: (ev: VADEvent) => void;\n onEndOfSpeech: (ev: VADEvent) => void;\n onInterimTranscript: (ev: SpeechEvent) => void;\n onFinalTranscript: (ev: SpeechEvent) => void;\n onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;\n\n retrieveChatCtx: () => ChatContext;\n}\n\nexport interface _TurnDetector {\n unlikelyThreshold: (language?: string) => Promise<number | undefined>;\n supportsLanguage: (language?: string) => Promise<boolean>;\n predictEndOfTurn(chatCtx: ChatContext): Promise<number>;\n}\n\nexport interface AudioRecognitionOptions {\n recognitionHooks: RecognitionHooks;\n stt?: STTNode;\n vad?: VAD;\n turnDetector?: _TurnDetector;\n turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;\n minEndpointingDelay: number;\n maxEndpointingDelay: number;\n}\n\nexport class AudioRecognition {\n private hooks: RecognitionHooks;\n private stt?: STTNode;\n private vad?: VAD;\n private turnDetector?: _TurnDetector;\n private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;\n private minEndpointingDelay: number;\n private maxEndpointingDelay: number;\n private lastLanguage?: string;\n\n private deferredInputStream: DeferredReadableStream<AudioFrame>;\n private logger = log();\n private lastFinalTranscriptTime = 0;\n private audioTranscript = '';\n private audioInterimTranscript = '';\n private lastSpeakingTime = 0;\n private userTurnCommitted = false;\n private speaking = false;\n private sampleRate?: number;\n\n private vadInputStream: ReadableStream<AudioFrame>;\n private sttInputStream: ReadableStream<AudioFrame>;\n private silenceAudioTransform = new IdentityTransform<AudioFrame>();\n private silenceAudioWriter: WritableStreamDefaultWriter<AudioFrame>;\n\n // all cancellable tasks\n private bounceEOUTask?: Task<void>;\n private commitUserTurnTask?: Task<void>;\n private vadTask?: Task<void>;\n private sttTask?: Task<void>;\n\n constructor(opts: AudioRecognitionOptions) {\n this.hooks = opts.recognitionHooks;\n this.stt = opts.stt;\n this.vad = opts.vad;\n this.turnDetector = opts.turnDetector;\n this.turnDetectionMode = opts.turnDetectionMode;\n this.minEndpointingDelay = opts.minEndpointingDelay;\n this.maxEndpointingDelay = opts.maxEndpointingDelay;\n this.lastLanguage = undefined;\n\n this.deferredInputStream = new DeferredReadableStream<AudioFrame>();\n const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();\n this.vadInputStream = vadInputStream;\n this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);\n this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();\n }\n\n /**\n * Current transcript of the user's speech, including interim transcript if available.\n */\n get currentTranscript(): string {\n if (this.audioInterimTranscript) {\n return `${this.audioTranscript} ${this.audioInterimTranscript}`.trim();\n }\n return this.audioTranscript;\n }\n\n async start() {\n this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));\n this.vadTask.result.catch((err) => {\n this.logger.error(`Error running VAD task: ${err}`);\n });\n\n this.sttTask = Task.from(({ signal }) => this.createSttTask(this.stt, signal));\n this.sttTask.result.catch((err) => {\n this.logger.error(`Error running STT task: ${err}`);\n });\n }\n\n private async onSTTEvent(ev: SpeechEvent) {\n if (\n this.turnDetectionMode === 'manual' &&\n this.userTurnCommitted &&\n (this.bounceEOUTask === undefined ||\n this.bounceEOUTask.done ||\n ev.type == SpeechEventType.INTERIM_TRANSCRIPT)\n ) {\n // ignore stt event if user turn already committed and EOU task is done\n // or it's an interim transcript\n this.logger.debug(\n {\n userTurnCommitted: this.userTurnCommitted,\n eouTaskDone: this.bounceEOUTask?.done,\n evType: ev.type,\n turnDetectionMode: this.turnDetectionMode,\n },\n 'ignoring stt event',\n );\n return;\n }\n\n switch (ev.type) {\n case SpeechEventType.FINAL_TRANSCRIPT:\n this.hooks.onFinalTranscript(ev);\n const transcript = ev.alternatives?.[0]?.text;\n this.lastLanguage = ev.alternatives?.[0]?.language;\n\n if (!transcript) {\n // stt final transcript received but no transcript\n return;\n }\n\n this.logger.debug(\n {\n user_transcript: transcript,\n language: this.lastLanguage,\n },\n 'received user transcript',\n );\n\n this.lastFinalTranscriptTime = Date.now();\n this.audioTranscript += ` ${transcript}`;\n this.audioTranscript = this.audioTranscript.trimStart();\n this.audioInterimTranscript = '';\n\n if (!this.speaking) {\n if (!this.vad) {\n // Copied from python agents:\n // vad disabled, use stt timestamp\n // TODO: this would screw up transcription latency metrics\n // but we'll live with it for now.\n // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH\n // and using that timestamp for _last_speaking_time\n this.lastSpeakingTime = Date.now();\n }\n\n if (this.vadBaseTurnDetection || this.userTurnCommitted) {\n const chatCtx = this.hooks.retrieveChatCtx();\n this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');\n this.runEOUDetection(chatCtx);\n }\n }\n break;\n case SpeechEventType.INTERIM_TRANSCRIPT:\n this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');\n this.hooks.onInterimTranscript(ev);\n this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';\n break;\n case SpeechEventType.END_OF_SPEECH:\n if (this.turnDetectionMode !== 'stt') break;\n this.userTurnCommitted = true;\n\n if (!this.speaking) {\n const chatCtx = this.hooks.retrieveChatCtx();\n this.logger.debug('running EOU detection on stt END_OF_SPEECH');\n this.runEOUDetection(chatCtx);\n }\n }\n }\n\n private runEOUDetection(chatCtx: ChatContext) {\n this.logger.debug(\n {\n stt: this.stt,\n audioTranscript: this.audioTranscript,\n turnDetectionMode: this.turnDetectionMode,\n },\n 'running EOU detection',\n );\n\n if (this.stt && !this.audioTranscript && this.turnDetectionMode !== 'manual') {\n // stt enabled but no transcript yet\n this.logger.debug('skipping EOU detection');\n return;\n }\n\n chatCtx = chatCtx.copy();\n chatCtx.addMessage({ role: 'user', content: this.audioTranscript });\n\n const turnDetector =\n // disable EOU model if manual turn detection enabled\n this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;\n\n const bounceEOUTask = (lastSpeakingTime: number) => async (controller: AbortController) => {\n let endpointingDelay = this.minEndpointingDelay;\n\n // TODO(AJS-74): need to support actual turn detection model plugins for following code to run\n if (turnDetector) {\n this.logger.debug('Running turn detector model');\n if (!turnDetector.supportsLanguage(this.lastLanguage)) {\n this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);\n } else {\n const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);\n this.logger.debug(\n { endOfTurnProbability, language: this.lastLanguage },\n 'end of turn probability',\n );\n\n const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);\n this.logger.debug(\n {\n unlikelyThreshold,\n endOfTurnProbability,\n language: this.lastLanguage,\n transcript: this.audioTranscript,\n },\n 'EOU Detection',\n );\n\n if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {\n endpointingDelay = this.maxEndpointingDelay;\n }\n }\n }\n\n const extraSleep = lastSpeakingTime + endpointingDelay - Date.now();\n // add delay to see if there's a potential upcoming EOU task that cancels this one\n await delay(Math.max(extraSleep, 0), { signal: controller.signal });\n\n this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');\n\n const committed = await this.hooks.onEndOfTurn({\n newTranscript: this.audioTranscript,\n transcriptionDelay: Math.max(this.lastFinalTranscriptTime - lastSpeakingTime, 0),\n endOfUtteranceDelay: Date.now() - lastSpeakingTime,\n });\n\n if (committed) {\n // clear the transcript if the user turn was committed\n this.audioTranscript = '';\n }\n\n this.userTurnCommitted = false;\n };\n\n // cancel any existing EOU task\n this.bounceEOUTask?.cancel();\n this.bounceEOUTask = Task.from(bounceEOUTask(this.lastSpeakingTime));\n\n this.bounceEOUTask.result\n .then(() => {\n this.logger.debug('EOU detection task completed');\n })\n .catch((err: unknown) => {\n if (err instanceof Error && err.message.includes('This operation was aborted')) {\n // ignore aborted errors\n return;\n }\n this.logger.error(err, 'Error in EOU detection task:');\n });\n }\n\n private async createSttTask(stt: STTNode | undefined, signal: AbortSignal) {\n if (!stt) return;\n\n this.logger.debug('createSttTask: create stt stream from stt node');\n\n const sttStream = await stt(this.sttInputStream, {});\n\n if (signal.aborted || sttStream === null) return;\n\n if (sttStream instanceof ReadableStream) {\n const reader = sttStream.getReader();\n\n signal.addEventListener('abort', async () => {\n try {\n reader.releaseLock();\n await sttStream?.cancel();\n } catch (e) {\n this.logger.debug('createSttTask: error during abort handler:', e);\n }\n });\n\n try {\n while (true) {\n if (signal.aborted) break;\n\n const { done, value: ev } = await reader.read();\n if (done) break;\n\n if (typeof ev === 'string') {\n throw new Error('STT node must yield SpeechEvent');\n } else {\n await this.onSTTEvent(ev);\n }\n }\n } catch (e) {\n if (isStreamReaderReleaseError(e)) {\n return;\n }\n this.logger.error({ error: e }, 'createSttTask: error reading sttStream');\n } finally {\n reader.releaseLock();\n try {\n await sttStream.cancel();\n } catch (e) {\n this.logger.debug(\n 'createSttTask: error cancelling sttStream (may already be cancelled):',\n e,\n );\n }\n }\n }\n }\n\n private async createVadTask(vad: VAD | undefined, signal: AbortSignal) {\n if (!vad) return;\n\n const vadStream = vad.stream();\n vadStream.updateInputStream(this.vadInputStream);\n\n const abortHandler = () => {\n vadStream.detachInputStream();\n vadStream.close();\n signal.removeEventListener('abort', abortHandler);\n };\n signal.addEventListener('abort', abortHandler);\n\n try {\n for await (const ev of vadStream) {\n if (signal.aborted) break;\n\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.logger.debug('VAD task: START_OF_SPEECH');\n this.hooks.onStartOfSpeech(ev);\n this.speaking = true;\n\n this.bounceEOUTask?.cancel();\n break;\n case VADEventType.INFERENCE_DONE:\n this.hooks.onVADInferenceDone(ev);\n break;\n case VADEventType.END_OF_SPEECH:\n this.logger.debug('VAD task: END_OF_SPEECH');\n this.hooks.onEndOfSpeech(ev);\n this.speaking = false;\n // when VAD fires END_OF_SPEECH, it already waited for the silence_duration\n this.lastSpeakingTime = Date.now() - ev.silenceDuration;\n\n if (\n this.vadBaseTurnDetection ||\n (this.turnDetectionMode === 'stt' && this.userTurnCommitted)\n ) {\n const chatCtx = this.hooks.retrieveChatCtx();\n this.runEOUDetection(chatCtx);\n }\n break;\n }\n }\n } catch (e) {\n this.logger.error(e, 'Error in VAD task');\n } finally {\n this.logger.debug('VAD task closed');\n }\n }\n\n setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {\n this.deferredInputStream.setSource(audioStream);\n }\n\n detachInputAudioStream() {\n this.deferredInputStream.detachSource();\n }\n\n clearUserTurn() {\n this.audioTranscript = '';\n this.audioInterimTranscript = '';\n this.userTurnCommitted = false;\n\n this.sttTask?.cancelAndWait().finally(() => {\n this.sttTask = Task.from(({ signal }) => this.createSttTask(this.stt, signal));\n this.sttTask.result.catch((err) => {\n this.logger.error(`Error running STT task: ${err}`);\n });\n });\n }\n\n commitUserTurn(audioDetached: boolean) {\n const commitUserTurnTask =\n (delayDuration: number = 500) =>\n async (controller: AbortController) => {\n if (Date.now() - this.lastFinalTranscriptTime > delayDuration) {\n // flush the stt by pushing silence\n if (audioDetached && this.sampleRate !== undefined) {\n const numSamples = Math.floor(this.sampleRate * 0.5);\n const silence = new Int16Array(numSamples * 2);\n const silenceFrame = new AudioFrame(silence, this.sampleRate, 1, numSamples);\n this.silenceAudioWriter.write(silenceFrame);\n }\n\n // wait for the final transcript to be available\n await delay(delayDuration, { signal: controller.signal });\n }\n\n if (this.audioInterimTranscript) {\n // append interim transcript in case the final transcript is not ready\n this.audioTranscript = `${this.audioTranscript} ${this.audioInterimTranscript}`.trim();\n }\n this.audioInterimTranscript = '';\n\n const chatCtx = this.hooks.retrieveChatCtx();\n this.logger.debug('running EOU detection on commitUserTurn');\n this.runEOUDetection(chatCtx);\n this.userTurnCommitted = true;\n };\n\n // cancel any existing commit user turn task\n this.commitUserTurnTask?.cancel();\n this.commitUserTurnTask = Task.from(commitUserTurnTask());\n\n this.commitUserTurnTask.result\n .then(() => {\n this.logger.debug('User turn committed');\n })\n .catch((err: unknown) => {\n this.logger.error(err, 'Error in user turn commit task:');\n });\n }\n\n async close() {\n this.detachInputAudioStream();\n await this.commitUserTurnTask?.cancelAndWait();\n await this.sttTask?.cancelAndWait();\n await this.vadTask?.cancelAndWait();\n await this.bounceEOUTask?.cancelAndWait();\n }\n\n private get vadBaseTurnDetection() {\n return ['vad', undefined].includes(this.turnDetectionMode);\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,sBAA2B;AAC3B,mBAAsB;AAEtB,iBAA+B;AAC/B,0BAAiC;AACjC,iBAAoB;AACpB,6BAAmE;AACnE,gCAAkC;AAClC,oCAAqC;AACrC,iBAAkD;AAClD,mBAAqB;AACrB,iBAAsD;AAqC/C,MAAM,iBAAiB;AAAA,EACpB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EAEA;AAAA,EACA,aAAS,gBAAI;AAAA,EACb,0BAA0B;AAAA,EAC1B,kBAAkB;AAAA,EAClB,yBAAyB;AAAA,EACzB,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,WAAW;AAAA,EACX;AAAA,EAEA;AAAA,EACA;AAAA,EACA,wBAAwB,IAAI,4CAA8B;AAAA,EAC1D;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EAER,YAAY,MAA+B;AACzC,SAAK,QAAQ,KAAK;AAClB,SAAK,MAAM,KAAK;AAChB,SAAK,MAAM,KAAK;AAChB,SAAK,eAAe,KAAK;AACzB,SAAK,oBAAoB,KAAK;AAC9B,SAAK,sBAAsB,KAAK;AAChC,SAAK,sBAAsB,KAAK;AAChC,SAAK,eAAe;AAEpB,SAAK,sBAAsB,IAAI,8CAAmC;AAClE,UAAM,CAAC,gBAAgB,cAAc,IAAI,KAAK,oBAAoB,OAAO,IAAI;AAC7E,SAAK,iBAAiB;AACtB,SAAK,qBAAiB,oDAAqB,gBAAgB,KAAK,sBAAsB,QAAQ;AAC9F,SAAK,qBAAqB,KAAK,sBAAsB,SAAS,UAAU;AAAA,EAC1E;AAAA;AAAA;AAAA;AAAA,EAKA,IAAI,oBAA4B;AAC9B,QAAI,KAAK,wBAAwB;AAC/B,aAAO,GAAG,KAAK,eAAe,IAAI,KAAK,sBAAsB,GAAG,KAAK;AAAA,IACvE;AACA,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU,kBAAK,KAAK,CAAC,EAAE,OAAO,MAAM,KAAK,cAAc,KAAK,KAAK,MAAM,CAAC;AAC7E,SAAK,QAAQ,OAAO,MAAM,CAAC,QAAQ;AACjC,WAAK,OAAO,MAAM,2BAA2B,GAAG,EAAE;AAAA,IACpD,CAAC;AAED,SAAK,UAAU,kBAAK,KAAK,CAAC,EAAE,OAAO,MAAM,KAAK,cAAc,KAAK,KAAK,MAAM,CAAC;AAC7E,SAAK,QAAQ,OAAO,MAAM,CAAC,QAAQ;AACjC,WAAK,OAAO,MAAM,2BAA2B,GAAG,EAAE;AAAA,IACpD,CAAC;AAAA,EACH;AAAA,EAEA,MAAc,WAAW,IAAiB;AAzH5C;AA0HI,QACE,KAAK,sBAAsB,YAC3B,KAAK,sBACJ,KAAK,kBAAkB,UACtB,KAAK,cAAc,QACnB,GAAG,QAAQ,2BAAgB,qBAC7B;AAGA,WAAK,OAAO;AAAA,QACV;AAAA,UACE,mBAAmB,KAAK;AAAA,UACxB,cAAa,UAAK,kBAAL,mBAAoB;AAAA,UACjC,QAAQ,GAAG;AAAA,UACX,mBAAmB,KAAK;AAAA,QAC1B;AAAA,QACA;AAAA,MACF;AACA;AAAA,IACF;AAEA,YAAQ,GAAG,MAAM;AAAA,MACf,KAAK,2BAAgB;AACnB,aAAK,MAAM,kBAAkB,EAAE;AAC/B,cAAM,cAAa,cAAG,iBAAH,mBAAkB,OAAlB,mBAAsB;AACzC,aAAK,gBAAe,cAAG,iBAAH,mBAAkB,OAAlB,mBAAsB;AAE1C,YAAI,CAAC,YAAY;AAEf;AAAA,QACF;AAEA,aAAK,OAAO;AAAA,UACV;AAAA,YACE,iBAAiB;AAAA,YACjB,UAAU,KAAK;AAAA,UACjB;AAAA,UACA;AAAA,QACF;AAEA,aAAK,0BAA0B,KAAK,IAAI;AACxC,aAAK,mBAAmB,IAAI,UAAU;AACtC,aAAK,kBAAkB,KAAK,gBAAgB,UAAU;AACtD,aAAK,yBAAyB;AAE9B,YAAI,CAAC,KAAK,UAAU;AAClB,cAAI,CAAC,KAAK,KAAK;AAOb,iBAAK,mBAAmB,KAAK,IAAI;AAAA,UACnC;AAEA,cAAI,KAAK,wBAAwB,KAAK,mBAAmB;AACvD,kBAAM,UAAU,KAAK,MAAM,gBAAgB;AAC3C,iBAAK,OAAO,MAAM,+CAA+C;AACjE,iBAAK,gBAAgB,OAAO;AAAA,UAC9B;AAAA,QACF;AACA;AAAA,MACF,KAAK,2BAAgB;AACnB,aAAK,OAAO,MAAM,EAAE,aAAY,cAAG,iBAAH,mBAAkB,OAAlB,mBAAsB,KAAK,GAAG,oBAAoB;AAClF,aAAK,MAAM,oBAAoB,EAAE;AACjC,aAAK,2BAAyB,cAAG,iBAAH,mBAAkB,OAAlB,mBAAsB,SAAQ;AAC5D;AAAA,MACF,KAAK,2BAAgB;AACnB,YAAI,KAAK,sBAAsB,MAAO;AACtC,aAAK,oBAAoB;AAEzB,YAAI,CAAC,KAAK,UAAU;AAClB,gBAAM,UAAU,KAAK,MAAM,gBAAgB;AAC3C,eAAK,OAAO,MAAM,4CAA4C;AAC9D,eAAK,gBAAgB,OAAO;AAAA,QAC9B;AAAA,IACJ;AAAA,EACF;AAAA,EAEQ,gBAAgB,SAAsB;AA1MhD;AA2MI,SAAK,OAAO;AAAA,MACV;AAAA,QACE,KAAK,KAAK;AAAA,QACV,iBAAiB,KAAK;AAAA,QACtB,mBAAmB,KAAK;AAAA,MAC1B;AAAA,MACA;AAAA,IACF;AAEA,QAAI,KAAK,OAAO,CAAC,KAAK,mBAAmB,KAAK,sBAAsB,UAAU;AAE5E,WAAK,OAAO,MAAM,wBAAwB;AAC1C;AAAA,IACF;AAEA,cAAU,QAAQ,KAAK;AACvB,YAAQ,WAAW,EAAE,MAAM,QAAQ,SAAS,KAAK,gBAAgB,CAAC;AAElE,UAAM;AAAA;AAAA,MAEJ,KAAK,mBAAmB,KAAK,sBAAsB,WAAW,KAAK,eAAe;AAAA;AAEpF,UAAM,gBAAgB,CAAC,qBAA6B,OAAO,eAAgC;AACzF,UAAI,mBAAmB,KAAK;AAG5B,UAAI,cAAc;AAChB,aAAK,OAAO,MAAM,6BAA6B;AAC/C,YAAI,CAAC,aAAa,iBAAiB,KAAK,YAAY,GAAG;AACrD,eAAK,OAAO,MAAM,2CAA2C,KAAK,YAAY,EAAE;AAAA,QAClF,OAAO;AACL,gBAAM,uBAAuB,MAAM,aAAa,iBAAiB,OAAO;AACxE,eAAK,OAAO;AAAA,YACV,EAAE,sBAAsB,UAAU,KAAK,aAAa;AAAA,YACpD;AAAA,UACF;AAEA,gBAAM,oBAAoB,MAAM,aAAa,kBAAkB,KAAK,YAAY;AAChF,eAAK,OAAO;AAAA,YACV;AAAA,cACE;AAAA,cACA;AAAA,cACA,UAAU,KAAK;AAAA,cACf,YAAY,KAAK;AAAA,YACnB;AAAA,YACA;AAAA,UACF;AAEA,cAAI,qBAAqB,uBAAuB,mBAAmB;AACjE,+BAAmB,KAAK;AAAA,UAC1B;AAAA,QACF;AAAA,MACF;AAEA,YAAM,aAAa,mBAAmB,mBAAmB,KAAK,IAAI;AAElE,gBAAM,oBAAM,KAAK,IAAI,YAAY,CAAC,GAAG,EAAE,QAAQ,WAAW,OAAO,CAAC;AAElE,WAAK,OAAO,MAAM,EAAE,YAAY,KAAK,gBAAgB,GAAG,kBAAkB;AAE1E,YAAM,YAAY,MAAM,KAAK,MAAM,YAAY;AAAA,QAC7C,eAAe,KAAK;AAAA,QACpB,oBAAoB,KAAK,IAAI,KAAK,0BAA0B,kBAAkB,CAAC;AAAA,QAC/E,qBAAqB,KAAK,IAAI,IAAI;AAAA,MACpC,CAAC;AAED,UAAI,WAAW;AAEb,aAAK,kBAAkB;AAAA,MACzB;AAEA,WAAK,oBAAoB;AAAA,IAC3B;AAGA,eAAK,kBAAL,mBAAoB;AACpB,SAAK,gBAAgB,kBAAK,KAAK,cAAc,KAAK,gBAAgB,CAAC;AAEnE,SAAK,cAAc,OAChB,KAAK,MAAM;AACV,WAAK,OAAO,MAAM,8BAA8B;AAAA,IAClD,CAAC,EACA,MAAM,CAAC,QAAiB;AACvB,UAAI,eAAe,SAAS,IAAI,QAAQ,SAAS,4BAA4B,GAAG;AAE9E;AAAA,MACF;AACA,WAAK,OAAO,MAAM,KAAK,8BAA8B;AAAA,IACvD,CAAC;AAAA,EACL;AAAA,EAEA,MAAc,cAAc,KAA0B,QAAqB;AACzE,QAAI,CAAC,IAAK;AAEV,SAAK,OAAO,MAAM,gDAAgD;AAElE,UAAM,YAAY,MAAM,IAAI,KAAK,gBAAgB,CAAC,CAAC;AAEnD,QAAI,OAAO,WAAW,cAAc,KAAM;AAE1C,QAAI,qBAAqB,2BAAgB;AACvC,YAAM,SAAS,UAAU,UAAU;AAEnC,aAAO,iBAAiB,SAAS,YAAY;AAC3C,YAAI;AACF,iBAAO,YAAY;AACnB,iBAAM,uCAAW;AAAA,QACnB,SAAS,GAAG;AACV,eAAK,OAAO,MAAM,8CAA8C,CAAC;AAAA,QACnE;AAAA,MACF,CAAC;AAED,UAAI;AACF,eAAO,MAAM;AACX,cAAI,OAAO,QAAS;AAEpB,gBAAM,EAAE,MAAM,OAAO,GAAG,IAAI,MAAM,OAAO,KAAK;AAC9C,cAAI,KAAM;AAEV,cAAI,OAAO,OAAO,UAAU;AAC1B,kBAAM,IAAI,MAAM,iCAAiC;AAAA,UACnD,OAAO;AACL,kBAAM,KAAK,WAAW,EAAE;AAAA,UAC1B;AAAA,QACF;AAAA,MACF,SAAS,GAAG;AACV,gBAAI,mDAA2B,CAAC,GAAG;AACjC;AAAA,QACF;AACA,aAAK,OAAO,MAAM,EAAE,OAAO,EAAE,GAAG,wCAAwC;AAAA,MAC1E,UAAE;AACA,eAAO,YAAY;AACnB,YAAI;AACF,gBAAM,UAAU,OAAO;AAAA,QACzB,SAAS,GAAG;AACV,eAAK,OAAO;AAAA,YACV;AAAA,YACA;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,cAAc,KAAsB,QAAqB;AA3VzE;AA4VI,QAAI,CAAC,IAAK;AAEV,UAAM,YAAY,IAAI,OAAO;AAC7B,cAAU,kBAAkB,KAAK,cAAc;AAE/C,UAAM,eAAe,MAAM;AACzB,gBAAU,kBAAkB;AAC5B,gBAAU,MAAM;AAChB,aAAO,oBAAoB,SAAS,YAAY;AAAA,IAClD;AACA,WAAO,iBAAiB,SAAS,YAAY;AAE7C,QAAI;AACF,uBAAiB,MAAM,WAAW;AAChC,YAAI,OAAO,QAAS;AAEpB,gBAAQ,GAAG,MAAM;AAAA,UACf,KAAK,wBAAa;AAChB,iBAAK,OAAO,MAAM,2BAA2B;AAC7C,iBAAK,MAAM,gBAAgB,EAAE;AAC7B,iBAAK,WAAW;AAEhB,uBAAK,kBAAL,mBAAoB;AACpB;AAAA,UACF,KAAK,wBAAa;AAChB,iBAAK,MAAM,mBAAmB,EAAE;AAChC;AAAA,UACF,KAAK,wBAAa;AAChB,iBAAK,OAAO,MAAM,yBAAyB;AAC3C,iBAAK,MAAM,cAAc,EAAE;AAC3B,iBAAK,WAAW;AAEhB,iBAAK,mBAAmB,KAAK,IAAI,IAAI,GAAG;AAExC,gBACE,KAAK,wBACJ,KAAK,sBAAsB,SAAS,KAAK,mBAC1C;AACA,oBAAM,UAAU,KAAK,MAAM,gBAAgB;AAC3C,mBAAK,gBAAgB,OAAO;AAAA,YAC9B;AACA;AAAA,QACJ;AAAA,MACF;AAAA,IACF,SAAS,GAAG;AACV,WAAK,OAAO,MAAM,GAAG,mBAAmB;AAAA,IAC1C,UAAE;AACA,WAAK,OAAO,MAAM,iBAAiB;AAAA,IACrC;AAAA,EACF;AAAA,EAEA,oBAAoB,aAAyC;AAC3D,SAAK,oBAAoB,UAAU,WAAW;AAAA,EAChD;AAAA,EAEA,yBAAyB;AACvB,SAAK,oBAAoB,aAAa;AAAA,EACxC;AAAA,EAEA,gBAAgB;AAvZlB;AAwZI,SAAK,kBAAkB;AACvB,SAAK,yBAAyB;AAC9B,SAAK,oBAAoB;AAEzB,eAAK,YAAL,mBAAc,gBAAgB,QAAQ,MAAM;AAC1C,WAAK,UAAU,kBAAK,KAAK,CAAC,EAAE,OAAO,MAAM,KAAK,cAAc,KAAK,KAAK,MAAM,CAAC;AAC7E,WAAK,QAAQ,OAAO,MAAM,CAAC,QAAQ;AACjC,aAAK,OAAO,MAAM,2BAA2B,GAAG,EAAE;AAAA,MACpD,CAAC;AAAA,IACH;AAAA,EACF;AAAA,EAEA,eAAe,eAAwB;AApazC;AAqaI,UAAM,qBACJ,CAAC,gBAAwB,QACzB,OAAO,eAAgC;AACrC,UAAI,KAAK,IAAI,IAAI,KAAK,0BAA0B,eAAe;AAE7D,YAAI,iBAAiB,KAAK,eAAe,QAAW;AAClD,gBAAM,aAAa,KAAK,MAAM,KAAK,aAAa,GAAG;AACnD,gBAAM,UAAU,IAAI,WAAW,aAAa,CAAC;AAC7C,gBAAM,eAAe,IAAI,2BAAW,SAAS,KAAK,YAAY,GAAG,UAAU;AAC3E,eAAK,mBAAmB,MAAM,YAAY;AAAA,QAC5C;AAGA,kBAAM,oBAAM,eAAe,EAAE,QAAQ,WAAW,OAAO,CAAC;AAAA,MAC1D;AAEA,UAAI,KAAK,wBAAwB;AAE/B,aAAK,kBAAkB,GAAG,KAAK,eAAe,IAAI,KAAK,sBAAsB,GAAG,KAAK;AAAA,MACvF;AACA,WAAK,yBAAyB;AAE9B,YAAM,UAAU,KAAK,MAAM,gBAAgB;AAC3C,WAAK,OAAO,MAAM,yCAAyC;AAC3D,WAAK,gBAAgB,OAAO;AAC5B,WAAK,oBAAoB;AAAA,IAC3B;AAGF,eAAK,uBAAL,mBAAyB;AACzB,SAAK,qBAAqB,kBAAK,KAAK,mBAAmB,CAAC;AAExD,SAAK,mBAAmB,OACrB,KAAK,MAAM;AACV,WAAK,OAAO,MAAM,qBAAqB;AAAA,IACzC,CAAC,EACA,MAAM,CAAC,QAAiB;AACvB,WAAK,OAAO,MAAM,KAAK,iCAAiC;AAAA,IAC1D,CAAC;AAAA,EACL;AAAA,EAEA,MAAM,QAAQ;AA9chB;AA+cI,SAAK,uBAAuB;AAC5B,YAAM,UAAK,uBAAL,mBAAyB;AAC/B,YAAM,UAAK,YAAL,mBAAc;AACpB,YAAM,UAAK,YAAL,mBAAc;AACpB,YAAM,UAAK,kBAAL,mBAAoB;AAAA,EAC5B;AAAA,EAEA,IAAY,uBAAuB;AACjC,WAAO,CAAC,OAAO,MAAS,EAAE,SAAS,KAAK,iBAAiB;AAAA,EAC3D;AACF;","names":[]}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/// <reference types="node" resolution-mode="require"/>
|
|
2
|
+
import { AudioFrame } from '@livekit/rtc-node';
|
|
3
|
+
import { ReadableStream } from 'node:stream/web';
|
|
4
|
+
import { type ChatContext } from '../llm/chat_context.js';
|
|
5
|
+
import { type SpeechEvent } from '../stt/stt.js';
|
|
6
|
+
import { type VAD, type VADEvent } from '../vad.js';
|
|
7
|
+
import type { TurnDetectionMode } from './agent_session.js';
|
|
8
|
+
import type { STTNode } from './io.js';
|
|
9
|
+
export interface EndOfTurnInfo {
|
|
10
|
+
newTranscript: string;
|
|
11
|
+
transcriptionDelay: number;
|
|
12
|
+
endOfUtteranceDelay: number;
|
|
13
|
+
}
|
|
14
|
+
export interface RecognitionHooks {
|
|
15
|
+
onStartOfSpeech: (ev: VADEvent) => void;
|
|
16
|
+
onVADInferenceDone: (ev: VADEvent) => void;
|
|
17
|
+
onEndOfSpeech: (ev: VADEvent) => void;
|
|
18
|
+
onInterimTranscript: (ev: SpeechEvent) => void;
|
|
19
|
+
onFinalTranscript: (ev: SpeechEvent) => void;
|
|
20
|
+
onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
|
|
21
|
+
retrieveChatCtx: () => ChatContext;
|
|
22
|
+
}
|
|
23
|
+
export interface _TurnDetector {
|
|
24
|
+
unlikelyThreshold: (language?: string) => Promise<number | undefined>;
|
|
25
|
+
supportsLanguage: (language?: string) => Promise<boolean>;
|
|
26
|
+
predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
|
|
27
|
+
}
|
|
28
|
+
export interface AudioRecognitionOptions {
|
|
29
|
+
recognitionHooks: RecognitionHooks;
|
|
30
|
+
stt?: STTNode;
|
|
31
|
+
vad?: VAD;
|
|
32
|
+
turnDetector?: _TurnDetector;
|
|
33
|
+
turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
|
|
34
|
+
minEndpointingDelay: number;
|
|
35
|
+
maxEndpointingDelay: number;
|
|
36
|
+
}
|
|
37
|
+
export declare class AudioRecognition {
|
|
38
|
+
private hooks;
|
|
39
|
+
private stt?;
|
|
40
|
+
private vad?;
|
|
41
|
+
private turnDetector?;
|
|
42
|
+
private turnDetectionMode?;
|
|
43
|
+
private minEndpointingDelay;
|
|
44
|
+
private maxEndpointingDelay;
|
|
45
|
+
private lastLanguage?;
|
|
46
|
+
private deferredInputStream;
|
|
47
|
+
private logger;
|
|
48
|
+
private lastFinalTranscriptTime;
|
|
49
|
+
private audioTranscript;
|
|
50
|
+
private audioInterimTranscript;
|
|
51
|
+
private lastSpeakingTime;
|
|
52
|
+
private userTurnCommitted;
|
|
53
|
+
private speaking;
|
|
54
|
+
private sampleRate?;
|
|
55
|
+
private vadInputStream;
|
|
56
|
+
private sttInputStream;
|
|
57
|
+
private silenceAudioTransform;
|
|
58
|
+
private silenceAudioWriter;
|
|
59
|
+
private bounceEOUTask?;
|
|
60
|
+
private commitUserTurnTask?;
|
|
61
|
+
private vadTask?;
|
|
62
|
+
private sttTask?;
|
|
63
|
+
constructor(opts: AudioRecognitionOptions);
|
|
64
|
+
/**
|
|
65
|
+
* Current transcript of the user's speech, including interim transcript if available.
|
|
66
|
+
*/
|
|
67
|
+
get currentTranscript(): string;
|
|
68
|
+
start(): Promise<void>;
|
|
69
|
+
private onSTTEvent;
|
|
70
|
+
private runEOUDetection;
|
|
71
|
+
private createSttTask;
|
|
72
|
+
private createVadTask;
|
|
73
|
+
setInputAudioStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
74
|
+
detachInputAudioStream(): void;
|
|
75
|
+
clearUserTurn(): void;
|
|
76
|
+
commitUserTurn(audioDetached: boolean): void;
|
|
77
|
+
close(): Promise<void>;
|
|
78
|
+
private get vadBaseTurnDetection();
|
|
79
|
+
}
|
|
80
|
+
//# sourceMappingURL=audio_recognition.d.ts.map
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/// <reference types="node" resolution-mode="require"/>
|
|
2
|
+
import { AudioFrame } from '@livekit/rtc-node';
|
|
3
|
+
import { ReadableStream } from 'node:stream/web';
|
|
4
|
+
import { type ChatContext } from '../llm/chat_context.js';
|
|
5
|
+
import { type SpeechEvent } from '../stt/stt.js';
|
|
6
|
+
import { type VAD, type VADEvent } from '../vad.js';
|
|
7
|
+
import type { TurnDetectionMode } from './agent_session.js';
|
|
8
|
+
import type { STTNode } from './io.js';
|
|
9
|
+
export interface EndOfTurnInfo {
|
|
10
|
+
newTranscript: string;
|
|
11
|
+
transcriptionDelay: number;
|
|
12
|
+
endOfUtteranceDelay: number;
|
|
13
|
+
}
|
|
14
|
+
export interface RecognitionHooks {
|
|
15
|
+
onStartOfSpeech: (ev: VADEvent) => void;
|
|
16
|
+
onVADInferenceDone: (ev: VADEvent) => void;
|
|
17
|
+
onEndOfSpeech: (ev: VADEvent) => void;
|
|
18
|
+
onInterimTranscript: (ev: SpeechEvent) => void;
|
|
19
|
+
onFinalTranscript: (ev: SpeechEvent) => void;
|
|
20
|
+
onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
|
|
21
|
+
retrieveChatCtx: () => ChatContext;
|
|
22
|
+
}
|
|
23
|
+
export interface _TurnDetector {
|
|
24
|
+
unlikelyThreshold: (language?: string) => Promise<number | undefined>;
|
|
25
|
+
supportsLanguage: (language?: string) => Promise<boolean>;
|
|
26
|
+
predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
|
|
27
|
+
}
|
|
28
|
+
export interface AudioRecognitionOptions {
|
|
29
|
+
recognitionHooks: RecognitionHooks;
|
|
30
|
+
stt?: STTNode;
|
|
31
|
+
vad?: VAD;
|
|
32
|
+
turnDetector?: _TurnDetector;
|
|
33
|
+
turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
|
|
34
|
+
minEndpointingDelay: number;
|
|
35
|
+
maxEndpointingDelay: number;
|
|
36
|
+
}
|
|
37
|
+
export declare class AudioRecognition {
|
|
38
|
+
private hooks;
|
|
39
|
+
private stt?;
|
|
40
|
+
private vad?;
|
|
41
|
+
private turnDetector?;
|
|
42
|
+
private turnDetectionMode?;
|
|
43
|
+
private minEndpointingDelay;
|
|
44
|
+
private maxEndpointingDelay;
|
|
45
|
+
private lastLanguage?;
|
|
46
|
+
private deferredInputStream;
|
|
47
|
+
private logger;
|
|
48
|
+
private lastFinalTranscriptTime;
|
|
49
|
+
private audioTranscript;
|
|
50
|
+
private audioInterimTranscript;
|
|
51
|
+
private lastSpeakingTime;
|
|
52
|
+
private userTurnCommitted;
|
|
53
|
+
private speaking;
|
|
54
|
+
private sampleRate?;
|
|
55
|
+
private vadInputStream;
|
|
56
|
+
private sttInputStream;
|
|
57
|
+
private silenceAudioTransform;
|
|
58
|
+
private silenceAudioWriter;
|
|
59
|
+
private bounceEOUTask?;
|
|
60
|
+
private commitUserTurnTask?;
|
|
61
|
+
private vadTask?;
|
|
62
|
+
private sttTask?;
|
|
63
|
+
constructor(opts: AudioRecognitionOptions);
|
|
64
|
+
/**
|
|
65
|
+
* Current transcript of the user's speech, including interim transcript if available.
|
|
66
|
+
*/
|
|
67
|
+
get currentTranscript(): string;
|
|
68
|
+
start(): Promise<void>;
|
|
69
|
+
private onSTTEvent;
|
|
70
|
+
private runEOUDetection;
|
|
71
|
+
private createSttTask;
|
|
72
|
+
private createVadTask;
|
|
73
|
+
setInputAudioStream(audioStream: ReadableStream<AudioFrame>): void;
|
|
74
|
+
detachInputAudioStream(): void;
|
|
75
|
+
clearUserTurn(): void;
|
|
76
|
+
commitUserTurn(audioDetached: boolean): void;
|
|
77
|
+
close(): Promise<void>;
|
|
78
|
+
private get vadBaseTurnDetection();
|
|
79
|
+
}
|
|
80
|
+
//# sourceMappingURL=audio_recognition.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audio_recognition.d.ts","sourceRoot":"","sources":["../../src/voice/audio_recognition.ts"],"names":[],"mappings":";AAGA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAG/C,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAK1D,OAAO,EAAE,KAAK,WAAW,EAAmB,MAAM,eAAe,CAAC;AAElE,OAAO,EAAE,KAAK,GAAG,EAAE,KAAK,QAAQ,EAAgB,MAAM,WAAW,CAAC;AAClE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAC5D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAEvC,MAAM,WAAW,aAAa;IAC5B,aAAa,EAAE,MAAM,CAAC;IACtB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IACxC,kBAAkB,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC3C,aAAa,EAAE,CAAC,EAAE,EAAE,QAAQ,KAAK,IAAI,CAAC;IACtC,mBAAmB,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,CAAC;IAC/C,iBAAiB,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,CAAC;IAC7C,WAAW,EAAE,CAAC,IAAI,EAAE,aAAa,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAEvD,eAAe,EAAE,MAAM,WAAW,CAAC;CACpC;AAED,MAAM,WAAW,aAAa;IAC5B,iBAAiB,EAAE,CAAC,QAAQ,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,CAAC;IACtE,gBAAgB,EAAE,CAAC,QAAQ,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAC1D,gBAAgB,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACzD;AAED,MAAM,WAAW,uBAAuB;IACtC,gBAAgB,EAAE,gBAAgB,CAAC;IACnC,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,YAAY,CAAC,EAAE,aAAa,CAAC;IAC7B,iBAAiB,CAAC,EAAE,OAAO,CAAC,iBAAiB,EAAE,aAAa,CAAC,CAAC;IAC9D,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,KAAK,CAAmB;IAChC,OAAO,CAAC,GAAG,CAAC,CAAU;IACtB,OAAO,CAAC,GAAG,CAAC,CAAM;IAClB,OAAO,CAAC,YAAY,CAAC,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAC,CAA4C;IACtE,OAAO,CAAC,mBAAmB,CAAS;IACpC,OAAO,CAAC,mBAAmB,CAAS;IACpC,OAAO,CAAC,YAAY,CAAC,CAAS;IAE9B,OAAO,CAAC,mBAAmB,CAAqC;IAChE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,uBAAuB,CAAK;IACpC,OAAO,CAAC,eAAe,CAAM;IAC7B,OAAO,CAAC,sBAAsB,CAAM;IACpC,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,iBAAiB,CAAS;IAClC,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,UAAU,CAAC,CAAS;IAE5B,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,qBAAqB,CAAuC;IACpE,OAAO,CAAC,kBAAkB,CAA0C;IAGpE,OAAO,CAAC,aAAa,CAAC,CAAa;IACnC,OAAO,CAAC,kBAAkB,CAAC,CAAa;IACxC,OAAO,CAAC,OAAO,CAAC,CAAa;IAC7B,OAAO,CAAC,OAAO,CAAC,CAAa;gBAEjB,IAAI,EAAE,uBAAuB;IAiBzC;;OAEG;IACH,IAAI,iBAAiB,IAAI,MAAM,CAK9B;IAEK,KAAK;YAYG,UAAU;IAiFxB,OAAO,CAAC,eAAe;YA4FT,aAAa;YAqDb,aAAa;IAoD3B,mBAAmB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC;IAI3D,sBAAsB;IAItB,aAAa;IAab,cAAc,CAAC,aAAa,EAAE,OAAO;IA0C/B,KAAK;IAQX,OAAO,KAAK,oBAAoB,GAE/B;CACF"}
|