@livekit/agents 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/dist/index.cjs +6 -1
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +3 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +3 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/inference_runner.cjs +38 -0
  8. package/dist/inference_runner.cjs.map +1 -0
  9. package/dist/inference_runner.d.ts +11 -0
  10. package/dist/inference_runner.d.ts.map +1 -0
  11. package/dist/inference_runner.js +14 -0
  12. package/dist/inference_runner.js.map +1 -0
  13. package/dist/ipc/index.cjs +23 -0
  14. package/dist/ipc/index.cjs.map +1 -0
  15. package/dist/ipc/index.d.ts +2 -0
  16. package/dist/ipc/index.d.ts.map +1 -0
  17. package/dist/ipc/index.js +2 -0
  18. package/dist/ipc/index.js.map +1 -0
  19. package/dist/ipc/inference_executor.cjs +17 -0
  20. package/dist/ipc/inference_executor.cjs.map +1 -0
  21. package/dist/ipc/inference_executor.d.ts +4 -0
  22. package/dist/ipc/inference_executor.d.ts.map +1 -0
  23. package/dist/ipc/inference_executor.js +1 -0
  24. package/dist/ipc/inference_executor.js.map +1 -0
  25. package/dist/ipc/inference_proc_executor.cjs +97 -0
  26. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  27. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  28. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  29. package/dist/ipc/inference_proc_executor.js +72 -0
  30. package/dist/ipc/inference_proc_executor.js.map +1 -0
  31. package/dist/ipc/inference_proc_lazy_main.cjs +90 -0
  32. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  33. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  34. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  35. package/dist/ipc/inference_proc_lazy_main.js +67 -0
  36. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  37. package/dist/ipc/job_executor.cjs +8 -7
  38. package/dist/ipc/job_executor.cjs.map +1 -1
  39. package/dist/ipc/job_executor.d.ts +14 -15
  40. package/dist/ipc/job_executor.d.ts.map +1 -1
  41. package/dist/ipc/job_executor.js +7 -6
  42. package/dist/ipc/job_executor.js.map +1 -1
  43. package/dist/ipc/job_proc_executor.cjs +108 -0
  44. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  45. package/dist/ipc/job_proc_executor.d.ts +19 -0
  46. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  47. package/dist/ipc/job_proc_executor.js +83 -0
  48. package/dist/ipc/job_proc_executor.js.map +1 -0
  49. package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +41 -36
  50. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  51. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  52. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  53. package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +41 -11
  54. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  55. package/dist/ipc/message.cjs.map +1 -1
  56. package/dist/ipc/message.d.ts +17 -0
  57. package/dist/ipc/message.d.ts.map +1 -1
  58. package/dist/ipc/proc_pool.cjs +30 -4
  59. package/dist/ipc/proc_pool.cjs.map +1 -1
  60. package/dist/ipc/proc_pool.d.ts +5 -1
  61. package/dist/ipc/proc_pool.d.ts.map +1 -1
  62. package/dist/ipc/proc_pool.js +30 -4
  63. package/dist/ipc/proc_pool.js.map +1 -1
  64. package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +58 -46
  65. package/dist/ipc/supervised_proc.cjs.map +1 -0
  66. package/dist/ipc/supervised_proc.d.ts +30 -0
  67. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  68. package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +54 -32
  69. package/dist/ipc/supervised_proc.js.map +1 -0
  70. package/dist/job.cjs +18 -1
  71. package/dist/job.cjs.map +1 -1
  72. package/dist/job.d.ts +9 -1
  73. package/dist/job.d.ts.map +1 -1
  74. package/dist/job.js +17 -1
  75. package/dist/job.js.map +1 -1
  76. package/dist/metrics/base.cjs +2 -2
  77. package/dist/metrics/base.cjs.map +1 -1
  78. package/dist/metrics/base.d.ts +1 -1
  79. package/dist/metrics/base.d.ts.map +1 -1
  80. package/dist/metrics/base.js +2 -2
  81. package/dist/metrics/base.js.map +1 -1
  82. package/dist/multimodal/agent_playout.cjs +13 -14
  83. package/dist/multimodal/agent_playout.cjs.map +1 -1
  84. package/dist/multimodal/agent_playout.d.ts +4 -4
  85. package/dist/multimodal/agent_playout.d.ts.map +1 -1
  86. package/dist/multimodal/agent_playout.js +13 -14
  87. package/dist/multimodal/agent_playout.js.map +1 -1
  88. package/dist/multimodal/multimodal_agent.cjs +12 -8
  89. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  90. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  91. package/dist/multimodal/multimodal_agent.js +13 -9
  92. package/dist/multimodal/multimodal_agent.js.map +1 -1
  93. package/dist/pipeline/agent_output.cjs +20 -4
  94. package/dist/pipeline/agent_output.cjs.map +1 -1
  95. package/dist/pipeline/agent_output.d.ts +4 -2
  96. package/dist/pipeline/agent_output.d.ts.map +1 -1
  97. package/dist/pipeline/agent_output.js +20 -4
  98. package/dist/pipeline/agent_output.js.map +1 -1
  99. package/dist/pipeline/agent_playout.cjs +9 -3
  100. package/dist/pipeline/agent_playout.cjs.map +1 -1
  101. package/dist/pipeline/agent_playout.d.ts +4 -2
  102. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  103. package/dist/pipeline/agent_playout.js +9 -3
  104. package/dist/pipeline/agent_playout.js.map +1 -1
  105. package/dist/pipeline/human_input.cjs +6 -0
  106. package/dist/pipeline/human_input.cjs.map +1 -1
  107. package/dist/pipeline/human_input.d.ts +3 -1
  108. package/dist/pipeline/human_input.d.ts.map +1 -1
  109. package/dist/pipeline/human_input.js +6 -0
  110. package/dist/pipeline/human_input.js.map +1 -1
  111. package/dist/pipeline/pipeline_agent.cjs +79 -12
  112. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  113. package/dist/pipeline/pipeline_agent.d.ts +8 -0
  114. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  115. package/dist/pipeline/pipeline_agent.js +79 -12
  116. package/dist/pipeline/pipeline_agent.js.map +1 -1
  117. package/dist/stt/stream_adapter.cjs +16 -4
  118. package/dist/stt/stream_adapter.cjs.map +1 -1
  119. package/dist/stt/stream_adapter.d.ts.map +1 -1
  120. package/dist/stt/stream_adapter.js +16 -4
  121. package/dist/stt/stream_adapter.js.map +1 -1
  122. package/dist/tokenize/basic/basic.cjs +2 -0
  123. package/dist/tokenize/basic/basic.cjs.map +1 -1
  124. package/dist/tokenize/basic/basic.d.ts +2 -0
  125. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  126. package/dist/tokenize/basic/basic.js +1 -0
  127. package/dist/tokenize/basic/basic.js.map +1 -1
  128. package/dist/tokenize/basic/index.cjs +2 -0
  129. package/dist/tokenize/basic/index.cjs.map +1 -1
  130. package/dist/tokenize/basic/index.d.ts +1 -1
  131. package/dist/tokenize/basic/index.d.ts.map +1 -1
  132. package/dist/tokenize/basic/index.js +8 -1
  133. package/dist/tokenize/basic/index.js.map +1 -1
  134. package/dist/tokenize/token_stream.cjs +5 -3
  135. package/dist/tokenize/token_stream.cjs.map +1 -1
  136. package/dist/tokenize/token_stream.d.ts.map +1 -1
  137. package/dist/tokenize/token_stream.js +5 -3
  138. package/dist/tokenize/token_stream.js.map +1 -1
  139. package/dist/transcription.cjs +203 -86
  140. package/dist/transcription.cjs.map +1 -1
  141. package/dist/transcription.d.ts +24 -17
  142. package/dist/transcription.d.ts.map +1 -1
  143. package/dist/transcription.js +201 -85
  144. package/dist/transcription.js.map +1 -1
  145. package/dist/worker.cjs +42 -9
  146. package/dist/worker.cjs.map +1 -1
  147. package/dist/worker.d.ts +5 -1
  148. package/dist/worker.d.ts.map +1 -1
  149. package/dist/worker.js +42 -9
  150. package/dist/worker.js.map +1 -1
  151. package/package.json +3 -3
  152. package/src/index.ts +3 -1
  153. package/src/inference_runner.ts +19 -0
  154. package/src/ipc/index.ts +5 -0
  155. package/src/ipc/inference_executor.ts +7 -0
  156. package/src/ipc/inference_proc_executor.ts +93 -0
  157. package/src/ipc/inference_proc_lazy_main.ts +86 -0
  158. package/src/ipc/job_executor.ts +15 -17
  159. package/src/ipc/job_proc_executor.ts +112 -0
  160. package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +44 -14
  161. package/src/ipc/message.ts +14 -1
  162. package/src/ipc/proc_pool.ts +33 -3
  163. package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +80 -30
  164. package/src/job.ts +21 -0
  165. package/src/metrics/base.ts +7 -10
  166. package/src/multimodal/agent_playout.ts +14 -16
  167. package/src/multimodal/multimodal_agent.ts +13 -9
  168. package/src/pipeline/agent_output.ts +34 -5
  169. package/src/pipeline/agent_playout.ts +10 -1
  170. package/src/pipeline/human_input.ts +8 -0
  171. package/src/pipeline/pipeline_agent.ts +96 -11
  172. package/src/stt/stream_adapter.ts +17 -5
  173. package/src/tokenize/basic/basic.ts +2 -0
  174. package/src/tokenize/basic/index.ts +7 -1
  175. package/src/tokenize/token_stream.ts +6 -3
  176. package/src/transcription.ts +270 -96
  177. package/src/worker.ts +42 -5
  178. package/dist/ipc/job_main.cjs.map +0 -1
  179. package/dist/ipc/job_main.d.ts +0 -8
  180. package/dist/ipc/job_main.d.ts.map +0 -1
  181. package/dist/ipc/job_main.js.map +0 -1
  182. package/dist/ipc/proc_job_executor.cjs.map +0 -1
  183. package/dist/ipc/proc_job_executor.d.ts +0 -15
  184. package/dist/ipc/proc_job_executor.d.ts.map +0 -1
  185. package/dist/ipc/proc_job_executor.js.map +0 -1
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/multimodal/multimodal_agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n LocalTrackPublication,\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrack,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport {\n AudioSource,\n AudioStream,\n LocalAudioTrack,\n RoomEvent,\n TrackPublishOptions,\n TrackSource,\n} from '@livekit/rtc-node';\nimport { EventEmitter } from 'node:events';\nimport { AudioByteStream } from '../audio.js';\nimport * as llm from '../llm/index.js';\nimport { log } from '../log.js';\nimport type { MultimodalLLMMetrics } from '../metrics/base.js';\nimport { BasicTranscriptionForwarder } from '../transcription.js';\nimport { findMicroTrackId } from '../utils.js';\nimport { AgentPlayout, type PlayoutHandle } from './agent_playout.js';\n\n/**\n * @internal\n * @beta\n */\nexport abstract class RealtimeSession extends EventEmitter {\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract conversation: any; // openai.realtime.Conversation\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer\n abstract fncCtx: llm.FunctionContext | undefined;\n abstract recoverFromTextResponse(itemId: string): void;\n}\n\n/**\n * @internal\n * @beta\n */\nexport abstract class RealtimeModel {\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract session(options: any): RealtimeSession; // openai.realtime.ModelOptions\n abstract close(): Promise<void>;\n abstract sampleRate: number;\n abstract numChannels: number;\n abstract inFrameSize: number;\n abstract outFrameSize: number;\n}\n\nexport type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';\nexport const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';\n\n/** @beta */\nexport class MultimodalAgent extends EventEmitter {\n model: RealtimeModel;\n room: Room | null = null;\n linkedParticipant: RemoteParticipant | null = null;\n subscribedTrack: RemoteAudioTrack | null = null;\n readMicroTask: Promise<void> | null = null;\n\n #textResponseRetries = 0;\n #maxTextResponseRetries: number;\n\n constructor({\n model,\n chatCtx,\n fncCtx,\n maxTextResponseRetries = 5,\n }: {\n model: RealtimeModel;\n chatCtx?: llm.ChatContext;\n fncCtx?: llm.FunctionContext;\n maxTextResponseRetries?: number;\n }) {\n super();\n this.model = model;\n this.#chatCtx = chatCtx;\n this.#fncCtx = fncCtx;\n this.#maxTextResponseRetries = maxTextResponseRetries;\n }\n\n #participant: RemoteParticipant | string | null = null;\n #agentPublication: LocalTrackPublication | null = null;\n #localTrackSid: string | null = null;\n #localSource: AudioSource | null = null;\n #agentPlayout: AgentPlayout | null = null;\n #playingHandle: PlayoutHandle | undefined = undefined;\n #logger = log();\n #session: RealtimeSession | null = null;\n #fncCtx: llm.FunctionContext | undefined = undefined;\n #chatCtx: llm.ChatContext | undefined = undefined;\n\n #_started: boolean = false;\n #_pendingFunctionCalls: Set<string> = new Set();\n #_speaking: boolean = false;\n\n get fncCtx(): llm.FunctionContext | undefined {\n return this.#fncCtx;\n }\n\n set fncCtx(ctx: llm.FunctionContext | undefined) {\n this.#fncCtx = ctx;\n if (this.#session) {\n this.#session.fncCtx = ctx;\n }\n }\n\n get #pendingFunctionCalls(): Set<string> {\n return this.#_pendingFunctionCalls;\n }\n\n set #pendingFunctionCalls(calls: Set<string>) {\n this.#_pendingFunctionCalls = calls;\n this.#updateState();\n }\n\n get #speaking(): boolean {\n return this.#_speaking;\n }\n\n set #speaking(isSpeaking: boolean) {\n this.#_speaking = isSpeaking;\n this.#updateState();\n }\n\n get #started(): boolean {\n return this.#_started;\n }\n\n set #started(started: boolean) {\n this.#_started = started;\n this.#updateState();\n }\n\n start(\n room: Room,\n participant: RemoteParticipant | string | null = null,\n ): Promise<RealtimeSession> {\n return new Promise(async (resolve, reject) => {\n if (this.#started) {\n reject(new Error('MultimodalAgent already started'));\n }\n this.#updateState();\n\n room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {\n // automatically link to the first participant that connects, if not already linked\n if (this.linkedParticipant) {\n return;\n }\n this.#linkParticipant(participant.identity!);\n });\n room.on(\n RoomEvent.TrackPublished,\n (trackPublication: RemoteTrackPublication, participant: RemoteParticipant) => {\n if (\n this.linkedParticipant &&\n participant.identity === this.linkedParticipant.identity &&\n trackPublication.source === TrackSource.SOURCE_MICROPHONE &&\n !trackPublication.subscribed\n ) {\n trackPublication.setSubscribed(true);\n }\n },\n );\n room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));\n\n this.room = room;\n this.#participant = participant;\n\n this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);\n this.#agentPlayout = new AgentPlayout(\n this.#localSource,\n this.model.sampleRate,\n this.model.numChannels,\n this.model.inFrameSize,\n this.model.outFrameSize,\n );\n const onPlayoutStarted = () => {\n this.emit('agent_started_speaking');\n this.#speaking = true;\n };\n\n const onPlayoutStopped = (interrupted: boolean) => {\n this.emit('agent_stopped_speaking');\n this.#speaking = false;\n if (this.#playingHandle) {\n let text = this.#playingHandle.transcriptionFwd.text;\n if (interrupted) {\n text += '…';\n }\n const msg = llm.ChatMessage.create({\n role: llm.ChatRole.ASSISTANT,\n text,\n });\n\n if (interrupted) {\n this.emit('agent_speech_interrupted', msg);\n } else {\n this.emit('agent_speech_committed', msg);\n }\n this.#logger.child({ transcription: text, interrupted }).debug('committed agent speech');\n }\n };\n\n this.#agentPlayout.on('playout_started', onPlayoutStarted);\n this.#agentPlayout.on('playout_stopped', onPlayoutStopped);\n\n const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);\n const options = new TrackPublishOptions();\n options.source = TrackSource.SOURCE_MICROPHONE;\n this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;\n if (!this.#agentPublication) {\n this.#logger.error('Failed to publish track');\n reject(new Error('Failed to publish track'));\n return;\n }\n\n await this.#agentPublication.waitForSubscription();\n\n if (participant) {\n if (typeof participant === 'string') {\n this.#linkParticipant(participant);\n } else {\n this.#linkParticipant(participant.identity!);\n }\n } else {\n // No participant specified, try to find the first participant in the room\n for (const participant of room.remoteParticipants.values()) {\n this.#linkParticipant(participant.identity!);\n break;\n }\n }\n\n this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });\n this.#started = true;\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('response_content_added', (message: any) => {\n // openai.realtime.RealtimeContent\n if (message.contentType === 'text') return;\n\n const trFwd = new BasicTranscriptionForwarder(\n this.room!,\n this.room!.localParticipant!.identity!,\n this.#getLocalTrackSid()!,\n message.responseId,\n );\n\n const handle = this.#agentPlayout?.play(\n message.itemId,\n message.contentIndex,\n trFwd,\n message.textStream,\n message.audioStream,\n );\n this.#playingHandle = handle;\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('response_content_done', (message: any) => {\n // openai.realtime.RealtimeContent\n if (message.contentType === 'text') {\n if (this.#textResponseRetries >= this.#maxTextResponseRetries) {\n throw new Error(\n 'The OpenAI Realtime API returned a text response ' +\n `after ${this.#maxTextResponseRetries} retries. ` +\n 'Please try to reduce the number of text system or ' +\n 'assistant messages in the chat context.',\n );\n }\n\n this.#textResponseRetries++;\n this.#logger\n .child({\n itemId: message.itemId,\n text: message.text,\n retries: this.#textResponseRetries,\n })\n .warn(\n 'The OpenAI Realtime API returned a text response instead of audio. ' +\n 'Attempting to recover to audio mode...',\n );\n this.#session!.recoverFromTextResponse(message.itemId);\n } else {\n this.#textResponseRetries = 0;\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('input_speech_committed', (ev: any) => {\n // openai.realtime.InputSpeechCommittedEvent\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);\n } else {\n this.#logger.error('Participant or track not set');\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('input_speech_transcription_completed', (ev: any) => {\n // openai.realtime.InputSpeechTranscriptionCompletedEvent\n const transcription = ev.transcript;\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);\n } else {\n this.#logger.error('Participant or track not set');\n }\n const userMsg = llm.ChatMessage.create({\n role: llm.ChatRole.USER,\n text: transcription,\n });\n this.emit('user_speech_committed', userMsg);\n this.#logger.child({ transcription }).debug('committed user speech');\n });\n\n this.#session.on('input_speech_started', (ev: any) => {\n this.emit('user_started_speaking');\n if (this.#playingHandle && !this.#playingHandle.done) {\n this.#playingHandle.interrupt();\n\n this.#session!.conversation.item.truncate(\n this.#playingHandle.itemId,\n this.#playingHandle.contentIndex,\n Math.floor((this.#playingHandle.audioSamples / 24000) * 1000),\n );\n\n this.#playingHandle = undefined;\n }\n\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#session.on('input_speech_stopped', (ev: any) => {\n this.emit('user_stopped_speaking');\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_started', (ev: any) => {\n this.#pendingFunctionCalls.add(ev.callId);\n this.#updateState();\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_completed', (ev: any) => {\n this.#pendingFunctionCalls.delete(ev.callId);\n this.#updateState();\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_failed', (ev: any) => {\n this.#pendingFunctionCalls.delete(ev.callId);\n this.#updateState();\n });\n\n this.#session.on('metrics_collected', (metrics: MultimodalLLMMetrics) => {\n this.emit('metrics_collected', metrics);\n });\n\n resolve(this.#session);\n });\n }\n\n #linkParticipant(participantIdentity: string): void {\n if (!this.room) {\n this.#logger.error('Room is not set');\n return;\n }\n\n this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;\n if (!this.linkedParticipant) {\n this.#logger.error(`Participant with identity ${participantIdentity} not found`);\n return;\n }\n\n if (this.linkedParticipant.trackPublications.size > 0) {\n this.#subscribeToMicrophone();\n }\n\n // also check if already subscribed\n for (const publication of this.linkedParticipant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {\n this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);\n break;\n }\n }\n }\n\n #subscribeToMicrophone(): void {\n if (!this.linkedParticipant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.linkedParticipant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n }\n\n #handleTrackSubscription(\n track: RemoteTrack,\n publication: RemoteTrackPublication,\n participant: RemoteParticipant,\n ) {\n if (\n publication.source !== TrackSource.SOURCE_MICROPHONE ||\n participant.identity !== this.linkedParticipant?.identity\n ) {\n return;\n }\n const readAudioStreamTask = async (audioStream: AudioStream) => {\n const bstream = new AudioByteStream(\n this.model.sampleRate,\n this.model.numChannels,\n this.model.inFrameSize,\n );\n\n for await (const frame of audioStream) {\n const audioData = frame.data;\n for (const frame of bstream.write(audioData.buffer)) {\n this.#session!.inputAudioBuffer.append(frame);\n }\n }\n };\n this.subscribedTrack = track;\n\n this.readMicroTask = new Promise<void>((resolve, reject) => {\n readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))\n .then(resolve)\n .catch(reject);\n });\n }\n\n #getLocalTrackSid(): string | null {\n if (!this.#localTrackSid && this.room && this.room.localParticipant) {\n this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);\n }\n return this.#localTrackSid;\n }\n\n #publishTranscription(\n participantIdentity: string,\n trackSid: string,\n text: string,\n isFinal: boolean,\n id: string,\n ): void {\n this.#logger.debug(\n `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`,\n );\n if (!this.room?.localParticipant) {\n this.#logger.error('Room or local participant not set');\n return;\n }\n\n this.room.localParticipant.publishTranscription({\n participantIdentity,\n trackSid,\n segments: [\n {\n text,\n final: isFinal,\n id,\n startTime: BigInt(0),\n endTime: BigInt(0),\n language: '',\n },\n ],\n });\n }\n\n #updateState() {\n let newState: AgentState = 'initializing';\n if (this.#pendingFunctionCalls.size > 0) {\n newState = 'thinking';\n } else if (this.#speaking) {\n newState = 'speaking';\n } else if (this.#started) {\n newState = 'listening';\n }\n\n this.#setState(newState);\n }\n\n #setState(state: AgentState) {\n if (this.room?.isConnected && this.room.localParticipant) {\n const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];\n if (currentState !== state) {\n this.room.localParticipant.setAttributes({\n [AGENT_STATE_ATTRIBUTE]: state,\n });\n this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);\n }\n }\n }\n}\n"],"mappings":"AAWA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP,SAAS,oBAAoB;AAC7B,SAAS,uBAAuB;AAChC,YAAY,SAAS;AACrB,SAAS,WAAW;AAEpB,SAAS,mCAAmC;AAC5C,SAAS,wBAAwB;AACjC,SAAS,oBAAwC;AAM1C,MAAe,wBAAwB,aAAa;AAO3D;AAMO,MAAe,cAAc;AAQpC;AAGO,MAAM,wBAAwB;AAG9B,MAAM,wBAAwB,aAAa;AAAA,EAChD;AAAA,EACA,OAAoB;AAAA,EACpB,oBAA8C;AAAA,EAC9C,kBAA2C;AAAA,EAC3C,gBAAsC;AAAA,EAEtC,uBAAuB;AAAA,EACvB;AAAA,EAEA,YAAY;AAAA,IACV;AAAA,IACA;AAAA,IACA;AAAA,IACA,yBAAyB;AAAA,EAC3B,GAKG;AACD,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,WAAW;AAChB,SAAK,UAAU;AACf,SAAK,0BAA0B;AAAA,EACjC;AAAA,EAEA,eAAkD;AAAA,EAClD,oBAAkD;AAAA,EAClD,iBAAgC;AAAA,EAChC,eAAmC;AAAA,EACnC,gBAAqC;AAAA,EACrC,iBAA4C;AAAA,EAC5C,UAAU,IAAI;AAAA,EACd,WAAmC;AAAA,EACnC,UAA2C;AAAA,EAC3C,WAAwC;AAAA,EAExC,YAAqB;AAAA,EACrB,yBAAsC,oBAAI,IAAI;AAAA,EAC9C,aAAsB;AAAA,EAEtB,IAAI,SAA0C;AAC5C,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,OAAO,KAAsC;AAC/C,SAAK,UAAU;AACf,QAAI,KAAK,UAAU;AACjB,WAAK,SAAS,SAAS;AAAA,IACzB;AAAA,EACF;AAAA,EAEA,IAAI,wBAAqC;AACvC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAAsB,OAAoB;AAC5C,SAAK,yBAAyB;AAC9B,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAAU,YAAqB;AACjC,SAAK,aAAa;AAClB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,SAAS,SAAkB;AAC7B,SAAK,YAAY;AACjB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,MACE,MACA,cAAiD,MACvB;AAC1B,WAAO,IAAI,QAAQ,OAAO,SAAS,WAAW;AAhJlD;AAiJM,UAAI,KAAK,UAAU;AACjB,eAAO,IAAI,MAAM,iCAAiC,CAAC;AAAA,MACrD;AACA,WAAK,aAAa;AAElB,WAAK,GAAG,UAAU,sBAAsB,CAACA,iBAAmC;AAE1E,YAAI,KAAK,mBAAmB;AAC1B;AAAA,QACF;AACA,aAAK,iBAAiBA,aAAY,QAAS;AAAA,MAC7C,CAAC;AACD,WAAK;AAAA,QACH,UAAU;AAAA,QACV,CAAC,kBAA0CA,iBAAmC;AAC5E,cACE,KAAK,qBACLA,aAAY,aAAa,KAAK,kBAAkB,YAChD,iBAAiB,WAAW,YAAY,qBACxC,CAAC,iBAAiB,YAClB;AACA,6BAAiB,cAAc,IAAI;AAAA,UACrC;AAAA,QACF;AAAA,MACF;AACA,WAAK,GAAG,UAAU,iBAAiB,KAAK,yBAAyB,KAAK,IAAI,CAAC;AAE3E,WAAK,OAAO;AACZ,WAAK,eAAe;AAEpB,WAAK,eAAe,IAAI,YAAY,KAAK,MAAM,YAAY,KAAK,MAAM,WAAW;AACjF,WAAK,gBAAgB,IAAI;AAAA,QACvB,KAAK;AAAA,QACL,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,MACb;AACA,YAAM,mBAAmB,MAAM;AAC7B,aAAK,KAAK,wBAAwB;AAClC,aAAK,YAAY;AAAA,MACnB;AAEA,YAAM,mBAAmB,CAAC,gBAAyB;AACjD,aAAK,KAAK,wBAAwB;AAClC,aAAK,YAAY;AACjB,YAAI,KAAK,gBAAgB;AACvB,cAAI,OAAO,KAAK,eAAe,iBAAiB;AAChD,cAAI,aAAa;AACf,oBAAQ;AAAA,UACV;AACA,gBAAM,MAAM,IAAI,YAAY,OAAO;AAAA,YACjC,MAAM,IAAI,SAAS;AAAA,YACnB;AAAA,UACF,CAAC;AAED,cAAI,aAAa;AACf,iBAAK,KAAK,4BAA4B,GAAG;AAAA,UAC3C,OAAO;AACL,iBAAK,KAAK,0BAA0B,GAAG;AAAA,UACzC;AACA,eAAK,QAAQ,MAAM,EAAE,eAAe,MAAM,YAAY,CAAC,EAAE,MAAM,wBAAwB;AAAA,QACzF;AAAA,MACF;AAEA,WAAK,cAAc,GAAG,mBAAmB,gBAAgB;AACzD,WAAK,cAAc,GAAG,mBAAmB,gBAAgB;AAEzD,YAAM,QAAQ,gBAAgB,iBAAiB,mBAAmB,KAAK,YAAY;AACnF,YAAM,UAAU,IAAI,oBAAoB;AACxC,cAAQ,SAAS,YAAY;AAC7B,WAAK,oBAAqB,QAAM,UAAK,qBAAL,mBAAuB,aAAa,OAAO,aAAa;AACxF,UAAI,CAAC,KAAK,mBAAmB;AAC3B,aAAK,QAAQ,MAAM,yBAAyB;AAC5C,eAAO,IAAI,MAAM,yBAAyB,CAAC;AAC3C;AAAA,MACF;AAEA,YAAM,KAAK,kBAAkB,oBAAoB;AAEjD,UAAI,aAAa;AACf,YAAI,OAAO,gBAAgB,UAAU;AACnC,eAAK,iBAAiB,WAAW;AAAA,QACnC,OAAO;AACL,eAAK,iBAAiB,YAAY,QAAS;AAAA,QAC7C;AAAA,MACF,OAAO;AAEL,mBAAWA,gBAAe,KAAK,mBAAmB,OAAO,GAAG;AAC1D,eAAK,iBAAiBA,aAAY,QAAS;AAC3C;AAAA,QACF;AAAA,MACF;AAEA,WAAK,WAAW,KAAK,MAAM,QAAQ,EAAE,QAAQ,KAAK,SAAS,SAAS,KAAK,SAAS,CAAC;AACnF,WAAK,WAAW;AAGhB,WAAK,SAAS,GAAG,0BAA0B,CAAC,YAAiB;AAnPnE,YAAAC;AAqPQ,YAAI,QAAQ,gBAAgB,OAAQ;AAEpC,cAAM,QAAQ,IAAI;AAAA,UAChB,KAAK;AAAA,UACL,KAAK,KAAM,iBAAkB;AAAA,UAC7B,KAAK,kBAAkB;AAAA,UACvB,QAAQ;AAAA,QACV;AAEA,cAAM,UAASA,MAAA,KAAK,kBAAL,gBAAAA,IAAoB;AAAA,UACjC,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA,QAAQ;AAAA,UACR,QAAQ;AAAA;AAEV,aAAK,iBAAiB;AAAA,MACxB,CAAC;AAGD,WAAK,SAAS,GAAG,yBAAyB,CAAC,YAAiB;AAE1D,YAAI,QAAQ,gBAAgB,QAAQ;AAClC,cAAI,KAAK,wBAAwB,KAAK,yBAAyB;AAC7D,kBAAM,IAAI;AAAA,cACR,0DACW,KAAK,uBAAuB;AAAA,YAGzC;AAAA,UACF;AAEA,eAAK;AACL,eAAK,QACF,MAAM;AAAA,YACL,QAAQ,QAAQ;AAAA,YAChB,MAAM,QAAQ;AAAA,YACd,SAAS,KAAK;AAAA,UAChB,CAAC,EACA;AAAA,YACC;AAAA,UAEF;AACF,eAAK,SAAU,wBAAwB,QAAQ,MAAM;AAAA,QACvD,OAAO;AACL,eAAK,uBAAuB;AAAA,QAC9B;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,0BAA0B,CAAC,OAAY;AAvS9D,YAAAA,KAAA;AAySQ,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,UAAK,OAAO,GAAG,MAAM;AAAA,QACjF,OAAO;AACL,eAAK,QAAQ,MAAM,8BAA8B;AAAA,QACnD;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,wCAAwC,CAAC,OAAY;AAnT5E,YAAAA,KAAA;AAqTQ,cAAM,gBAAgB,GAAG;AACzB,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,eAAe,MAAM,GAAG,MAAM;AAAA,QAC1F,OAAO;AACL,eAAK,QAAQ,MAAM,8BAA8B;AAAA,QACnD;AACA,cAAM,UAAU,IAAI,YAAY,OAAO;AAAA,UACrC,MAAM,IAAI,SAAS;AAAA,UACnB,MAAM;AAAA,QACR,CAAC;AACD,aAAK,KAAK,yBAAyB,OAAO;AAC1C,aAAK,QAAQ,MAAM,EAAE,cAAc,CAAC,EAAE,MAAM,uBAAuB;AAAA,MACrE,CAAC;AAED,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AArU5D,YAAAA,KAAA;AAsUQ,aAAK,KAAK,uBAAuB;AACjC,YAAI,KAAK,kBAAkB,CAAC,KAAK,eAAe,MAAM;AACpD,eAAK,eAAe,UAAU;AAE9B,eAAK,SAAU,aAAa,KAAK;AAAA,YAC/B,KAAK,eAAe;AAAA,YACpB,KAAK,eAAe;AAAA,YACpB,KAAK,MAAO,KAAK,eAAe,eAAe,OAAS,GAAI;AAAA,UAC9D;AAEA,eAAK,iBAAiB;AAAA,QACxB;AAEA,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,UAAK,OAAO,GAAG,MAAM;AAAA,QACjF;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AACpD,aAAK,KAAK,uBAAuB;AAAA,MACnC,CAAC;AAGD,WAAK,SAAS,GAAG,yBAAyB,CAAC,OAAY;AACrD,aAAK,sBAAsB,IAAI,GAAG,MAAM;AACxC,aAAK,aAAa;AAAA,MACpB,CAAC;AAGD,WAAK,SAAS,GAAG,2BAA2B,CAAC,OAAY;AACvD,aAAK,sBAAsB,OAAO,GAAG,MAAM;AAC3C,aAAK,aAAa;AAAA,MACpB,CAAC;AAGD,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AACpD,aAAK,sBAAsB,OAAO,GAAG,MAAM;AAC3C,aAAK,aAAa;AAAA,MACpB,CAAC;AAED,WAAK,SAAS,GAAG,qBAAqB,CAAC,YAAkC;AACvE,aAAK,KAAK,qBAAqB,OAAO;AAAA,MACxC,CAAC;AAED,cAAQ,KAAK,QAAQ;AAAA,IACvB,CAAC;AAAA,EACH;AAAA,EAEA,iBAAiB,qBAAmC;AAClD,QAAI,CAAC,KAAK,MAAM;AACd,WAAK,QAAQ,MAAM,iBAAiB;AACpC;AAAA,IACF;AAEA,SAAK,oBAAoB,KAAK,KAAK,mBAAmB,IAAI,mBAAmB,KAAK;AAClF,QAAI,CAAC,KAAK,mBAAmB;AAC3B,WAAK,QAAQ,MAAM,6BAA6B,mBAAmB,YAAY;AAC/E;AAAA,IACF;AAEA,QAAI,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AACrD,WAAK,uBAAuB;AAAA,IAC9B;AAGA,eAAW,eAAe,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AAC3E,UAAI,YAAY,WAAW,YAAY,qBAAqB,YAAY,OAAO;AAC7E,aAAK,yBAAyB,YAAY,OAAO,aAAa,KAAK,iBAAiB;AACpF;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,mBAAmB;AAC3B,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AAC3E,UAAI,YAAY,WAAW,YAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAAA,EACF;AAAA,EAEA,yBACE,OACA,aACA,aACA;AA5aJ;AA6aI,QACE,YAAY,WAAW,YAAY,qBACnC,YAAY,eAAa,UAAK,sBAAL,mBAAwB,WACjD;AACA;AAAA,IACF;AACA,UAAM,sBAAsB,OAAO,gBAA6B;AAC9D,YAAM,UAAU,IAAI;AAAA,QAClB,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,MACb;AAEA,uBAAiB,SAAS,aAAa;AACrC,cAAM,YAAY,MAAM;AACxB,mBAAWC,UAAS,QAAQ,MAAM,UAAU,MAAM,GAAG;AACnD,eAAK,SAAU,iBAAiB,OAAOA,MAAK;AAAA,QAC9C;AAAA,MACF;AAAA,IACF;AACA,SAAK,kBAAkB;AAEvB,SAAK,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC1D,0BAAoB,IAAI,YAAY,OAAO,KAAK,MAAM,YAAY,KAAK,MAAM,WAAW,CAAC,EACtF,KAAK,OAAO,EACZ,MAAM,MAAM;AAAA,IACjB,CAAC;AAAA,EACH;AAAA,EAEA,oBAAmC;AACjC,QAAI,CAAC,KAAK,kBAAkB,KAAK,QAAQ,KAAK,KAAK,kBAAkB;AACnE,WAAK,iBAAiB,iBAAiB,KAAK,MAAM,KAAK,KAAK,iBAAkB,QAAS;AAAA,IACzF;AACA,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,sBACE,qBACA,UACA,MACA,SACA,IACM;AAvdV;AAwdI,SAAK,QAAQ;AAAA,MACX,4BAA4B,mBAAmB,IAAI,QAAQ,IAAI,IAAI,IAAI,OAAO,IAAI,EAAE;AAAA,IACtF;AACA,QAAI,GAAC,UAAK,SAAL,mBAAW,mBAAkB;AAChC,WAAK,QAAQ,MAAM,mCAAmC;AACtD;AAAA,IACF;AAEA,SAAK,KAAK,iBAAiB,qBAAqB;AAAA,MAC9C;AAAA,MACA;AAAA,MACA,UAAU;AAAA,QACR;AAAA,UACE;AAAA,UACA,OAAO;AAAA,UACP;AAAA,UACA,WAAW,OAAO,CAAC;AAAA,UACnB,SAAS,OAAO,CAAC;AAAA,UACjB,UAAU;AAAA,QACZ;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,eAAe;AACb,QAAI,WAAuB;AAC3B,QAAI,KAAK,sBAAsB,OAAO,GAAG;AACvC,iBAAW;AAAA,IACb,WAAW,KAAK,WAAW;AACzB,iBAAW;AAAA,IACb,WAAW,KAAK,UAAU;AACxB,iBAAW;AAAA,IACb;AAEA,SAAK,UAAU,QAAQ;AAAA,EACzB;AAAA,EAEA,UAAU,OAAmB;AA7f/B;AA8fI,UAAI,UAAK,SAAL,mBAAW,gBAAe,KAAK,KAAK,kBAAkB;AACxD,YAAM,eAAe,KAAK,KAAK,iBAAiB,WAAY,qBAAqB;AACjF,UAAI,iBAAiB,OAAO;AAC1B,aAAK,KAAK,iBAAiB,cAAc;AAAA,UACvC,CAAC,qBAAqB,GAAG;AAAA,QAC3B,CAAC;AACD,aAAK,QAAQ,MAAM,GAAG,qBAAqB,KAAK,YAAY,MAAM,KAAK,EAAE;AAAA,MAC3E;AAAA,IACF;AAAA,EACF;AACF;","names":["participant","_a","frame"]}
1
+ {"version":3,"sources":["../../src/multimodal/multimodal_agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n LocalTrackPublication,\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrack,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport {\n AudioSource,\n AudioStream,\n LocalAudioTrack,\n RoomEvent,\n TrackPublishOptions,\n TrackSource,\n} from '@livekit/rtc-node';\nimport { EventEmitter } from 'node:events';\nimport { AudioByteStream } from '../audio.js';\nimport * as llm from '../llm/index.js';\nimport { log } from '../log.js';\nimport type { MultimodalLLMMetrics } from '../metrics/base.js';\nimport { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';\nimport { findMicroTrackId } from '../utils.js';\nimport { AgentPlayout, type PlayoutHandle } from './agent_playout.js';\n\n/**\n * @internal\n * @beta\n */\nexport abstract class RealtimeSession extends EventEmitter {\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract conversation: any; // openai.realtime.Conversation\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer\n abstract fncCtx: llm.FunctionContext | undefined;\n abstract recoverFromTextResponse(itemId: string): void;\n}\n\n/**\n * @internal\n * @beta\n */\nexport abstract class RealtimeModel {\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n abstract session(options: any): RealtimeSession; // openai.realtime.ModelOptions\n abstract close(): Promise<void>;\n abstract sampleRate: number;\n abstract numChannels: number;\n abstract inFrameSize: number;\n abstract outFrameSize: number;\n}\n\nexport type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';\nexport const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';\n\n/** @beta */\nexport class MultimodalAgent extends EventEmitter {\n model: RealtimeModel;\n room: Room | null = null;\n linkedParticipant: RemoteParticipant | null = null;\n subscribedTrack: RemoteAudioTrack | null = null;\n readMicroTask: Promise<void> | null = null;\n\n #textResponseRetries = 0;\n #maxTextResponseRetries: number;\n\n constructor({\n model,\n chatCtx,\n fncCtx,\n maxTextResponseRetries = 5,\n }: {\n model: RealtimeModel;\n chatCtx?: llm.ChatContext;\n fncCtx?: llm.FunctionContext;\n maxTextResponseRetries?: number;\n }) {\n super();\n this.model = model;\n this.#chatCtx = chatCtx;\n this.#fncCtx = fncCtx;\n this.#maxTextResponseRetries = maxTextResponseRetries;\n }\n\n #participant: RemoteParticipant | string | null = null;\n #agentPublication: LocalTrackPublication | null = null;\n #localTrackSid: string | null = null;\n #localSource: AudioSource | null = null;\n #agentPlayout: AgentPlayout | null = null;\n #playingHandle: PlayoutHandle | undefined = undefined;\n #logger = log();\n #session: RealtimeSession | null = null;\n #fncCtx: llm.FunctionContext | undefined = undefined;\n #chatCtx: llm.ChatContext | undefined = undefined;\n\n #_started: boolean = false;\n #_pendingFunctionCalls: Set<string> = new Set();\n #_speaking: boolean = false;\n\n get fncCtx(): llm.FunctionContext | undefined {\n return this.#fncCtx;\n }\n\n set fncCtx(ctx: llm.FunctionContext | undefined) {\n this.#fncCtx = ctx;\n if (this.#session) {\n this.#session.fncCtx = ctx;\n }\n }\n\n get #pendingFunctionCalls(): Set<string> {\n return this.#_pendingFunctionCalls;\n }\n\n set #pendingFunctionCalls(calls: Set<string>) {\n this.#_pendingFunctionCalls = calls;\n this.#updateState();\n }\n\n get #speaking(): boolean {\n return this.#_speaking;\n }\n\n set #speaking(isSpeaking: boolean) {\n this.#_speaking = isSpeaking;\n this.#updateState();\n }\n\n get #started(): boolean {\n return this.#_started;\n }\n\n set #started(started: boolean) {\n this.#_started = started;\n this.#updateState();\n }\n\n start(\n room: Room,\n participant: RemoteParticipant | string | null = null,\n ): Promise<RealtimeSession> {\n return new Promise(async (resolve, reject) => {\n if (this.#started) {\n reject(new Error('MultimodalAgent already started'));\n }\n this.#updateState();\n\n room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {\n // automatically link to the first participant that connects, if not already linked\n if (this.linkedParticipant) {\n return;\n }\n this.#linkParticipant(participant.identity!);\n });\n room.on(\n RoomEvent.TrackPublished,\n (trackPublication: RemoteTrackPublication, participant: RemoteParticipant) => {\n if (\n this.linkedParticipant &&\n participant.identity === this.linkedParticipant.identity &&\n trackPublication.source === TrackSource.SOURCE_MICROPHONE &&\n !trackPublication.subscribed\n ) {\n trackPublication.setSubscribed(true);\n }\n },\n );\n room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));\n\n this.room = room;\n this.#participant = participant;\n\n this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);\n this.#agentPlayout = new AgentPlayout(\n this.#localSource,\n this.model.sampleRate,\n this.model.numChannels,\n this.model.inFrameSize,\n this.model.outFrameSize,\n );\n const onPlayoutStarted = () => {\n this.emit('agent_started_speaking');\n this.#speaking = true;\n };\n\n const onPlayoutStopped = (interrupted: boolean) => {\n this.emit('agent_stopped_speaking');\n this.#speaking = false;\n if (this.#playingHandle) {\n let text = this.#playingHandle.synchronizer.playedText;\n if (interrupted) {\n text += '…';\n }\n const msg = llm.ChatMessage.create({\n role: llm.ChatRole.ASSISTANT,\n text,\n });\n\n if (interrupted) {\n this.emit('agent_speech_interrupted', msg);\n } else {\n this.emit('agent_speech_committed', msg);\n }\n this.#logger.child({ transcription: text, interrupted }).debug('committed agent speech');\n }\n };\n\n this.#agentPlayout.on('playout_started', onPlayoutStarted);\n this.#agentPlayout.on('playout_stopped', onPlayoutStopped);\n\n const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);\n const options = new TrackPublishOptions();\n options.source = TrackSource.SOURCE_MICROPHONE;\n this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;\n if (!this.#agentPublication) {\n this.#logger.error('Failed to publish track');\n reject(new Error('Failed to publish track'));\n return;\n }\n\n await this.#agentPublication.waitForSubscription();\n\n if (participant) {\n if (typeof participant === 'string') {\n this.#linkParticipant(participant);\n } else {\n this.#linkParticipant(participant.identity!);\n }\n } else {\n // No participant specified, try to find the first participant in the room\n for (const participant of room.remoteParticipants.values()) {\n this.#linkParticipant(participant.identity!);\n break;\n }\n }\n\n this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });\n this.#started = true;\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('response_content_added', (message: any) => {\n // openai.realtime.RealtimeContent\n if (message.contentType === 'text') return;\n\n const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);\n synchronizer.on('textUpdated', (text) => {\n this.#publishTranscription(\n this.room!.localParticipant!.identity!,\n this.#getLocalTrackSid()!,\n text.text,\n text.final,\n text.id,\n );\n });\n\n const handle = this.#agentPlayout?.play(\n message.itemId,\n message.contentIndex,\n synchronizer,\n message.textStream,\n message.audioStream,\n );\n this.#playingHandle = handle;\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('response_content_done', (message: any) => {\n // openai.realtime.RealtimeContent\n if (message.contentType === 'text') {\n if (this.#textResponseRetries >= this.#maxTextResponseRetries) {\n throw new Error(\n 'The OpenAI Realtime API returned a text response ' +\n `after ${this.#maxTextResponseRetries} retries. ` +\n 'Please try to reduce the number of text system or ' +\n 'assistant messages in the chat context.',\n );\n }\n\n this.#textResponseRetries++;\n this.#logger\n .child({\n itemId: message.itemId,\n text: message.text,\n retries: this.#textResponseRetries,\n })\n .warn(\n 'The OpenAI Realtime API returned a text response instead of audio. ' +\n 'Attempting to recover to audio mode...',\n );\n this.#session!.recoverFromTextResponse(message.itemId);\n } else {\n this.#textResponseRetries = 0;\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('input_speech_committed', (ev: any) => {\n // openai.realtime.InputSpeechCommittedEvent\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);\n } else {\n this.#logger.error('Participant or track not set');\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('input_speech_transcription_completed', (ev: any) => {\n // openai.realtime.InputSpeechTranscriptionCompletedEvent\n const transcription = ev.transcript;\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);\n } else {\n this.#logger.error('Participant or track not set');\n }\n const userMsg = llm.ChatMessage.create({\n role: llm.ChatRole.USER,\n text: transcription,\n });\n this.emit('user_speech_committed', userMsg);\n this.#logger.child({ transcription }).debug('committed user speech');\n });\n\n this.#session.on('input_speech_started', (ev: any) => {\n this.emit('user_started_speaking');\n if (this.#playingHandle && !this.#playingHandle.done) {\n this.#playingHandle.interrupt();\n\n this.#session!.conversation.item.truncate(\n this.#playingHandle.itemId,\n this.#playingHandle.contentIndex,\n Math.floor((this.#playingHandle.audioSamples / 24000) * 1000),\n );\n\n this.#playingHandle = undefined;\n }\n\n const participantIdentity = this.linkedParticipant?.identity;\n const trackSid = this.subscribedTrack?.sid;\n if (participantIdentity && trackSid) {\n this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);\n }\n });\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#session.on('input_speech_stopped', (ev: any) => {\n this.emit('user_stopped_speaking');\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_started', (ev: any) => {\n this.#pendingFunctionCalls.add(ev.callId);\n this.#updateState();\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_completed', (ev: any) => {\n this.#pendingFunctionCalls.delete(ev.callId);\n this.#updateState();\n });\n\n // eslint-disable-next-line @typescript-eslint/no-explicit-any\n this.#session.on('function_call_failed', (ev: any) => {\n this.#pendingFunctionCalls.delete(ev.callId);\n this.#updateState();\n });\n\n this.#session.on('metrics_collected', (metrics: MultimodalLLMMetrics) => {\n this.emit('metrics_collected', metrics);\n });\n\n resolve(this.#session);\n });\n }\n\n #linkParticipant(participantIdentity: string): void {\n if (!this.room) {\n this.#logger.error('Room is not set');\n return;\n }\n\n this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;\n if (!this.linkedParticipant) {\n this.#logger.error(`Participant with identity ${participantIdentity} not found`);\n return;\n }\n\n if (this.linkedParticipant.trackPublications.size > 0) {\n this.#subscribeToMicrophone();\n }\n\n // also check if already subscribed\n for (const publication of this.linkedParticipant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {\n this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);\n break;\n }\n }\n }\n\n #subscribeToMicrophone(): void {\n if (!this.linkedParticipant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.linkedParticipant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n }\n\n #handleTrackSubscription(\n track: RemoteTrack,\n publication: RemoteTrackPublication,\n participant: RemoteParticipant,\n ) {\n if (\n publication.source !== TrackSource.SOURCE_MICROPHONE ||\n participant.identity !== this.linkedParticipant?.identity\n ) {\n return;\n }\n const readAudioStreamTask = async (audioStream: AudioStream) => {\n const bstream = new AudioByteStream(\n this.model.sampleRate,\n this.model.numChannels,\n this.model.inFrameSize,\n );\n\n for await (const frame of audioStream) {\n const audioData = frame.data;\n for (const frame of bstream.write(audioData.buffer)) {\n this.#session!.inputAudioBuffer.append(frame);\n }\n }\n };\n this.subscribedTrack = track;\n\n this.readMicroTask = new Promise<void>((resolve, reject) => {\n readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))\n .then(resolve)\n .catch(reject);\n });\n }\n\n #getLocalTrackSid(): string | null {\n if (!this.#localTrackSid && this.room && this.room.localParticipant) {\n this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);\n }\n return this.#localTrackSid;\n }\n\n #publishTranscription(\n participantIdentity: string,\n trackSid: string,\n text: string,\n isFinal: boolean,\n id: string,\n ): void {\n this.#logger.debug(\n `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`,\n );\n if (!this.room?.localParticipant) {\n this.#logger.error('Room or local participant not set');\n return;\n }\n\n this.room.localParticipant.publishTranscription({\n participantIdentity,\n trackSid,\n segments: [\n {\n text,\n final: isFinal,\n id,\n startTime: BigInt(0),\n endTime: BigInt(0),\n language: '',\n },\n ],\n });\n }\n\n #updateState() {\n let newState: AgentState = 'initializing';\n if (this.#pendingFunctionCalls.size > 0) {\n newState = 'thinking';\n } else if (this.#speaking) {\n newState = 'speaking';\n } else if (this.#started) {\n newState = 'listening';\n }\n\n this.#setState(newState);\n }\n\n #setState(state: AgentState) {\n if (this.room?.isConnected && this.room.localParticipant) {\n const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];\n if (currentState !== state) {\n this.room.localParticipant.setAttributes({\n [AGENT_STATE_ATTRIBUTE]: state,\n });\n this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);\n }\n }\n }\n}\n"],"mappings":"AAWA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP,SAAS,oBAAoB;AAC7B,SAAS,uBAAuB;AAChC,YAAY,SAAS;AACrB,SAAS,WAAW;AAEpB,SAAS,uBAAuB,8BAA8B;AAC9D,SAAS,wBAAwB;AACjC,SAAS,oBAAwC;AAM1C,MAAe,wBAAwB,aAAa;AAO3D;AAMO,MAAe,cAAc;AAQpC;AAGO,MAAM,wBAAwB;AAG9B,MAAM,wBAAwB,aAAa;AAAA,EAChD;AAAA,EACA,OAAoB;AAAA,EACpB,oBAA8C;AAAA,EAC9C,kBAA2C;AAAA,EAC3C,gBAAsC;AAAA,EAEtC,uBAAuB;AAAA,EACvB;AAAA,EAEA,YAAY;AAAA,IACV;AAAA,IACA;AAAA,IACA;AAAA,IACA,yBAAyB;AAAA,EAC3B,GAKG;AACD,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,WAAW;AAChB,SAAK,UAAU;AACf,SAAK,0BAA0B;AAAA,EACjC;AAAA,EAEA,eAAkD;AAAA,EAClD,oBAAkD;AAAA,EAClD,iBAAgC;AAAA,EAChC,eAAmC;AAAA,EACnC,gBAAqC;AAAA,EACrC,iBAA4C;AAAA,EAC5C,UAAU,IAAI;AAAA,EACd,WAAmC;AAAA,EACnC,UAA2C;AAAA,EAC3C,WAAwC;AAAA,EAExC,YAAqB;AAAA,EACrB,yBAAsC,oBAAI,IAAI;AAAA,EAC9C,aAAsB;AAAA,EAEtB,IAAI,SAA0C;AAC5C,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,OAAO,KAAsC;AAC/C,SAAK,UAAU;AACf,QAAI,KAAK,UAAU;AACjB,WAAK,SAAS,SAAS;AAAA,IACzB;AAAA,EACF;AAAA,EAEA,IAAI,wBAAqC;AACvC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAAsB,OAAoB;AAC5C,SAAK,yBAAyB;AAC9B,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,UAAU,YAAqB;AACjC,SAAK,aAAa;AAClB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,SAAS,SAAkB;AAC7B,SAAK,YAAY;AACjB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,MACE,MACA,cAAiD,MACvB;AAC1B,WAAO,IAAI,QAAQ,OAAO,SAAS,WAAW;AAhJlD;AAiJM,UAAI,KAAK,UAAU;AACjB,eAAO,IAAI,MAAM,iCAAiC,CAAC;AAAA,MACrD;AACA,WAAK,aAAa;AAElB,WAAK,GAAG,UAAU,sBAAsB,CAACA,iBAAmC;AAE1E,YAAI,KAAK,mBAAmB;AAC1B;AAAA,QACF;AACA,aAAK,iBAAiBA,aAAY,QAAS;AAAA,MAC7C,CAAC;AACD,WAAK;AAAA,QACH,UAAU;AAAA,QACV,CAAC,kBAA0CA,iBAAmC;AAC5E,cACE,KAAK,qBACLA,aAAY,aAAa,KAAK,kBAAkB,YAChD,iBAAiB,WAAW,YAAY,qBACxC,CAAC,iBAAiB,YAClB;AACA,6BAAiB,cAAc,IAAI;AAAA,UACrC;AAAA,QACF;AAAA,MACF;AACA,WAAK,GAAG,UAAU,iBAAiB,KAAK,yBAAyB,KAAK,IAAI,CAAC;AAE3E,WAAK,OAAO;AACZ,WAAK,eAAe;AAEpB,WAAK,eAAe,IAAI,YAAY,KAAK,MAAM,YAAY,KAAK,MAAM,WAAW;AACjF,WAAK,gBAAgB,IAAI;AAAA,QACvB,KAAK;AAAA,QACL,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,MACb;AACA,YAAM,mBAAmB,MAAM;AAC7B,aAAK,KAAK,wBAAwB;AAClC,aAAK,YAAY;AAAA,MACnB;AAEA,YAAM,mBAAmB,CAAC,gBAAyB;AACjD,aAAK,KAAK,wBAAwB;AAClC,aAAK,YAAY;AACjB,YAAI,KAAK,gBAAgB;AACvB,cAAI,OAAO,KAAK,eAAe,aAAa;AAC5C,cAAI,aAAa;AACf,oBAAQ;AAAA,UACV;AACA,gBAAM,MAAM,IAAI,YAAY,OAAO;AAAA,YACjC,MAAM,IAAI,SAAS;AAAA,YACnB;AAAA,UACF,CAAC;AAED,cAAI,aAAa;AACf,iBAAK,KAAK,4BAA4B,GAAG;AAAA,UAC3C,OAAO;AACL,iBAAK,KAAK,0BAA0B,GAAG;AAAA,UACzC;AACA,eAAK,QAAQ,MAAM,EAAE,eAAe,MAAM,YAAY,CAAC,EAAE,MAAM,wBAAwB;AAAA,QACzF;AAAA,MACF;AAEA,WAAK,cAAc,GAAG,mBAAmB,gBAAgB;AACzD,WAAK,cAAc,GAAG,mBAAmB,gBAAgB;AAEzD,YAAM,QAAQ,gBAAgB,iBAAiB,mBAAmB,KAAK,YAAY;AACnF,YAAM,UAAU,IAAI,oBAAoB;AACxC,cAAQ,SAAS,YAAY;AAC7B,WAAK,oBAAqB,QAAM,UAAK,qBAAL,mBAAuB,aAAa,OAAO,aAAa;AACxF,UAAI,CAAC,KAAK,mBAAmB;AAC3B,aAAK,QAAQ,MAAM,yBAAyB;AAC5C,eAAO,IAAI,MAAM,yBAAyB,CAAC;AAC3C;AAAA,MACF;AAEA,YAAM,KAAK,kBAAkB,oBAAoB;AAEjD,UAAI,aAAa;AACf,YAAI,OAAO,gBAAgB,UAAU;AACnC,eAAK,iBAAiB,WAAW;AAAA,QACnC,OAAO;AACL,eAAK,iBAAiB,YAAY,QAAS;AAAA,QAC7C;AAAA,MACF,OAAO;AAEL,mBAAWA,gBAAe,KAAK,mBAAmB,OAAO,GAAG;AAC1D,eAAK,iBAAiBA,aAAY,QAAS;AAC3C;AAAA,QACF;AAAA,MACF;AAEA,WAAK,WAAW,KAAK,MAAM,QAAQ,EAAE,QAAQ,KAAK,SAAS,SAAS,KAAK,SAAS,CAAC;AACnF,WAAK,WAAW;AAGhB,WAAK,SAAS,GAAG,0BAA0B,CAAC,YAAiB;AAnPnE,YAAAC;AAqPQ,YAAI,QAAQ,gBAAgB,OAAQ;AAEpC,cAAM,eAAe,IAAI,sBAAsB,sBAAsB;AACrE,qBAAa,GAAG,eAAe,CAAC,SAAS;AACvC,eAAK;AAAA,YACH,KAAK,KAAM,iBAAkB;AAAA,YAC7B,KAAK,kBAAkB;AAAA,YACvB,KAAK;AAAA,YACL,KAAK;AAAA,YACL,KAAK;AAAA,UACP;AAAA,QACF,CAAC;AAED,cAAM,UAASA,MAAA,KAAK,kBAAL,gBAAAA,IAAoB;AAAA,UACjC,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA,QAAQ;AAAA,UACR,QAAQ;AAAA;AAEV,aAAK,iBAAiB;AAAA,MACxB,CAAC;AAGD,WAAK,SAAS,GAAG,yBAAyB,CAAC,YAAiB;AAE1D,YAAI,QAAQ,gBAAgB,QAAQ;AAClC,cAAI,KAAK,wBAAwB,KAAK,yBAAyB;AAC7D,kBAAM,IAAI;AAAA,cACR,0DACW,KAAK,uBAAuB;AAAA,YAGzC;AAAA,UACF;AAEA,eAAK;AACL,eAAK,QACF,MAAM;AAAA,YACL,QAAQ,QAAQ;AAAA,YAChB,MAAM,QAAQ;AAAA,YACd,SAAS,KAAK;AAAA,UAChB,CAAC,EACA;AAAA,YACC;AAAA,UAEF;AACF,eAAK,SAAU,wBAAwB,QAAQ,MAAM;AAAA,QACvD,OAAO;AACL,eAAK,uBAAuB;AAAA,QAC9B;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,0BAA0B,CAAC,OAAY;AA3S9D,YAAAA,KAAA;AA6SQ,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,UAAK,OAAO,GAAG,MAAM;AAAA,QACjF,OAAO;AACL,eAAK,QAAQ,MAAM,8BAA8B;AAAA,QACnD;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,wCAAwC,CAAC,OAAY;AAvT5E,YAAAA,KAAA;AAyTQ,cAAM,gBAAgB,GAAG;AACzB,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,eAAe,MAAM,GAAG,MAAM;AAAA,QAC1F,OAAO;AACL,eAAK,QAAQ,MAAM,8BAA8B;AAAA,QACnD;AACA,cAAM,UAAU,IAAI,YAAY,OAAO;AAAA,UACrC,MAAM,IAAI,SAAS;AAAA,UACnB,MAAM;AAAA,QACR,CAAC;AACD,aAAK,KAAK,yBAAyB,OAAO;AAC1C,aAAK,QAAQ,MAAM,EAAE,cAAc,CAAC,EAAE,MAAM,uBAAuB;AAAA,MACrE,CAAC;AAED,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AAzU5D,YAAAA,KAAA;AA0UQ,aAAK,KAAK,uBAAuB;AACjC,YAAI,KAAK,kBAAkB,CAAC,KAAK,eAAe,MAAM;AACpD,eAAK,eAAe,UAAU;AAE9B,eAAK,SAAU,aAAa,KAAK;AAAA,YAC/B,KAAK,eAAe;AAAA,YACpB,KAAK,eAAe;AAAA,YACpB,KAAK,MAAO,KAAK,eAAe,eAAe,OAAS,GAAI;AAAA,UAC9D;AAEA,eAAK,iBAAiB;AAAA,QACxB;AAEA,cAAM,uBAAsBA,MAAA,KAAK,sBAAL,gBAAAA,IAAwB;AACpD,cAAM,YAAW,UAAK,oBAAL,mBAAsB;AACvC,YAAI,uBAAuB,UAAU;AACnC,eAAK,sBAAsB,qBAAqB,UAAU,UAAK,OAAO,GAAG,MAAM;AAAA,QACjF;AAAA,MACF,CAAC;AAGD,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AACpD,aAAK,KAAK,uBAAuB;AAAA,MACnC,CAAC;AAGD,WAAK,SAAS,GAAG,yBAAyB,CAAC,OAAY;AACrD,aAAK,sBAAsB,IAAI,GAAG,MAAM;AACxC,aAAK,aAAa;AAAA,MACpB,CAAC;AAGD,WAAK,SAAS,GAAG,2BAA2B,CAAC,OAAY;AACvD,aAAK,sBAAsB,OAAO,GAAG,MAAM;AAC3C,aAAK,aAAa;AAAA,MACpB,CAAC;AAGD,WAAK,SAAS,GAAG,wBAAwB,CAAC,OAAY;AACpD,aAAK,sBAAsB,OAAO,GAAG,MAAM;AAC3C,aAAK,aAAa;AAAA,MACpB,CAAC;AAED,WAAK,SAAS,GAAG,qBAAqB,CAAC,YAAkC;AACvE,aAAK,KAAK,qBAAqB,OAAO;AAAA,MACxC,CAAC;AAED,cAAQ,KAAK,QAAQ;AAAA,IACvB,CAAC;AAAA,EACH;AAAA,EAEA,iBAAiB,qBAAmC;AAClD,QAAI,CAAC,KAAK,MAAM;AACd,WAAK,QAAQ,MAAM,iBAAiB;AACpC;AAAA,IACF;AAEA,SAAK,oBAAoB,KAAK,KAAK,mBAAmB,IAAI,mBAAmB,KAAK;AAClF,QAAI,CAAC,KAAK,mBAAmB;AAC3B,WAAK,QAAQ,MAAM,6BAA6B,mBAAmB,YAAY;AAC/E;AAAA,IACF;AAEA,QAAI,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AACrD,WAAK,uBAAuB;AAAA,IAC9B;AAGA,eAAW,eAAe,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AAC3E,UAAI,YAAY,WAAW,YAAY,qBAAqB,YAAY,OAAO;AAC7E,aAAK,yBAAyB,YAAY,OAAO,aAAa,KAAK,iBAAiB;AACpF;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,mBAAmB;AAC3B,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,kBAAkB,kBAAkB,OAAO,GAAG;AAC3E,UAAI,YAAY,WAAW,YAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAAA,EACF;AAAA,EAEA,yBACE,OACA,aACA,aACA;AAhbJ;AAibI,QACE,YAAY,WAAW,YAAY,qBACnC,YAAY,eAAa,UAAK,sBAAL,mBAAwB,WACjD;AACA;AAAA,IACF;AACA,UAAM,sBAAsB,OAAO,gBAA6B;AAC9D,YAAM,UAAU,IAAI;AAAA,QAClB,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,QACX,KAAK,MAAM;AAAA,MACb;AAEA,uBAAiB,SAAS,aAAa;AACrC,cAAM,YAAY,MAAM;AACxB,mBAAWC,UAAS,QAAQ,MAAM,UAAU,MAAM,GAAG;AACnD,eAAK,SAAU,iBAAiB,OAAOA,MAAK;AAAA,QAC9C;AAAA,MACF;AAAA,IACF;AACA,SAAK,kBAAkB;AAEvB,SAAK,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC1D,0BAAoB,IAAI,YAAY,OAAO,KAAK,MAAM,YAAY,KAAK,MAAM,WAAW,CAAC,EACtF,KAAK,OAAO,EACZ,MAAM,MAAM;AAAA,IACjB,CAAC;AAAA,EACH;AAAA,EAEA,oBAAmC;AACjC,QAAI,CAAC,KAAK,kBAAkB,KAAK,QAAQ,KAAK,KAAK,kBAAkB;AACnE,WAAK,iBAAiB,iBAAiB,KAAK,MAAM,KAAK,KAAK,iBAAkB,QAAS;AAAA,IACzF;AACA,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,sBACE,qBACA,UACA,MACA,SACA,IACM;AA3dV;AA4dI,SAAK,QAAQ;AAAA,MACX,4BAA4B,mBAAmB,IAAI,QAAQ,IAAI,IAAI,IAAI,OAAO,IAAI,EAAE;AAAA,IACtF;AACA,QAAI,GAAC,UAAK,SAAL,mBAAW,mBAAkB;AAChC,WAAK,QAAQ,MAAM,mCAAmC;AACtD;AAAA,IACF;AAEA,SAAK,KAAK,iBAAiB,qBAAqB;AAAA,MAC9C;AAAA,MACA;AAAA,MACA,UAAU;AAAA,QACR;AAAA,UACE;AAAA,UACA,OAAO;AAAA,UACP;AAAA,UACA,WAAW,OAAO,CAAC;AAAA,UACnB,SAAS,OAAO,CAAC;AAAA,UACjB,UAAU;AAAA,QACZ;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,eAAe;AACb,QAAI,WAAuB;AAC3B,QAAI,KAAK,sBAAsB,OAAO,GAAG;AACvC,iBAAW;AAAA,IACb,WAAW,KAAK,WAAW;AACzB,iBAAW;AAAA,IACb,WAAW,KAAK,UAAU;AACxB,iBAAW;AAAA,IACb;AAEA,SAAK,UAAU,QAAQ;AAAA,EACzB;AAAA,EAEA,UAAU,OAAmB;AAjgB/B;AAkgBI,UAAI,UAAK,SAAL,mBAAW,gBAAe,KAAK,KAAK,kBAAkB;AACxD,YAAM,eAAe,KAAK,KAAK,iBAAiB,WAAY,qBAAqB;AACjF,UAAI,iBAAiB,OAAO;AAC1B,aAAK,KAAK,iBAAiB,cAAc;AAAA,UACvC,CAAC,qBAAqB,GAAG;AAAA,QAC3B,CAAC;AACD,aAAK,QAAQ,MAAM,GAAG,qBAAqB,KAAK,YAAY,MAAM,KAAK,EAAE;AAAA,MAC3E;AAAA,IACF;AAAA,EACF;AACF;","names":["participant","_a","frame"]}
@@ -36,11 +36,13 @@ class SynthesisHandle {
36
36
  #playHandle;
37
37
  intFut = new import_utils.Future();
38
38
  #logger = (0, import_log.log)();
39
- constructor(speechId, ttsSource, agentPlayout, tts) {
39
+ synchronizer;
40
+ constructor(speechId, ttsSource, agentPlayout, tts, synchronizer) {
40
41
  this.#speechId = speechId;
41
42
  this.ttsSource = ttsSource;
42
43
  this.#agentPlayout = agentPlayout;
43
44
  this.tts = tts;
45
+ this.synchronizer = synchronizer;
44
46
  }
45
47
  get speechId() {
46
48
  return this.#speechId;
@@ -59,7 +61,7 @@ class SynthesisHandle {
59
61
  if (this.interrupted) {
60
62
  throw new Error("synthesis was interrupted");
61
63
  }
62
- this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);
64
+ this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);
63
65
  return this.#playHandle;
64
66
  }
65
67
  /** Interrupt the speech. */
@@ -88,8 +90,14 @@ class AgentOutput {
88
90
  this.#tasks.forEach((task) => task.cancel());
89
91
  await Promise.all(this.#tasks);
90
92
  }
91
- synthesize(speechId, ttsSource) {
92
- const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
93
+ synthesize(speechId, ttsSource, synchronizer) {
94
+ const handle = new SynthesisHandle(
95
+ speechId,
96
+ ttsSource,
97
+ this.#agentPlayout,
98
+ this.#tts,
99
+ synchronizer
100
+ );
93
101
  const task = this.#synthesize(handle);
94
102
  this.#tasks.push(task);
95
103
  task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));
@@ -130,6 +138,8 @@ const stringSynthesisTask = (text, handle) => {
130
138
  });
131
139
  const ttsStream = handle.tts.stream();
132
140
  ttsStream.pushText(text);
141
+ handle.synchronizer.pushText(text);
142
+ handle.synchronizer.markTextSegmentEnd();
133
143
  ttsStream.flush();
134
144
  ttsStream.endInput();
135
145
  for await (const audio of ttsStream) {
@@ -164,8 +174,14 @@ const streamSynthesisTask = (stream, handle) => {
164
174
  for await (const text of stream) {
165
175
  fullText += text;
166
176
  if (cancelled) break;
177
+ handle.synchronizer.pushText(text);
167
178
  ttsStream.pushText(text);
168
179
  }
180
+ handle.synchronizer.markTextSegmentEnd();
181
+ if (!fullText || fullText.trim().length === 0) {
182
+ cancelled = true;
183
+ handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
184
+ }
169
185
  ttsStream.flush();
170
186
  ttsStream.endInput();
171
187
  resolve(fullText);
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_output.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { log } from '../log.js';\nimport { SynthesizeStream, type TTS } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport type { AgentPlayout, PlayoutHandle } from './agent_playout.js';\n\nexport type SpeechSource = AsyncIterable<string> | string | Promise<string>;\n\nexport class SynthesisHandle {\n static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #speechId: string;\n text?: string;\n ttsSource: SpeechSource;\n #agentPlayout: AgentPlayout;\n tts: TTS;\n queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();\n #playHandle?: PlayoutHandle;\n intFut = new Future();\n #logger = log();\n\n constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {\n this.#speechId = speechId;\n this.ttsSource = ttsSource;\n this.#agentPlayout = agentPlayout;\n this.tts = tts;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get validated(): boolean {\n return !!this.#playHandle;\n }\n\n get interrupted(): boolean {\n return this.intFut.done;\n }\n\n get playHandle(): PlayoutHandle | undefined {\n return this.#playHandle;\n }\n\n /** Validate the speech for playout. */\n play(): PlayoutHandle {\n if (this.interrupted) {\n throw new Error('synthesis was interrupted');\n }\n\n this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);\n return this.#playHandle;\n }\n\n /** Interrupt the speech. */\n interrupt() {\n if (this.interrupted) {\n return;\n }\n\n this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');\n this.#playHandle?.interrupt();\n this.intFut.resolve();\n }\n}\n\nexport class AgentOutput {\n #agentPlayout: AgentPlayout;\n #tts: TTS;\n #tasks: CancellablePromise<void>[] = [];\n\n constructor(agentPlayout: AgentPlayout, tts: TTS) {\n this.#agentPlayout = agentPlayout;\n this.#tts = tts;\n }\n\n get playout(): AgentPlayout {\n return this.#agentPlayout;\n }\n\n async close() {\n this.#tasks.forEach((task) => task.cancel());\n await Promise.all(this.#tasks);\n }\n\n synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {\n const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);\n const task = this.#synthesize(handle);\n this.#tasks.push(task);\n task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));\n return handle;\n }\n\n #synthesize(handle: SynthesisHandle): CancellablePromise<void> {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const ttsSource = await handle.ttsSource;\n let task: CancellablePromise<string>;\n if (typeof ttsSource === 'string') {\n task = stringSynthesisTask(ttsSource, handle);\n } else {\n task = streamSynthesisTask(ttsSource, handle);\n }\n\n onCancel(() => {\n gracefullyCancel(task);\n });\n\n try {\n await Promise.any([task, handle.intFut.await]);\n } finally {\n if (handle.intFut.done) {\n gracefullyCancel(task);\n } else {\n task.then((text) => {\n handle.text = text;\n });\n }\n }\n\n resolve();\n });\n }\n}\n\nconst stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n ttsStream.pushText(text);\n ttsStream.flush();\n ttsStream.endInput();\n for await (const audio of ttsStream) {\n if (cancelled || audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n\n resolve(text);\n });\n};\n\nconst streamSynthesisTask = (\n stream: AsyncIterable<string>,\n handle: SynthesisHandle,\n): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let fullText = '';\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n const readGeneratedAudio = async () => {\n for await (const audio of ttsStream) {\n if (cancelled) break;\n if (audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n };\n readGeneratedAudio();\n\n for await (const text of stream) {\n fullText += text;\n if (cancelled) break;\n ttsStream.pushText(text);\n }\n ttsStream.flush();\n ttsStream.endInput();\n\n resolve(fullText);\n });\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,iBAAoB;AACpB,iBAA2C;AAC3C,mBAAiF;AAK1E,MAAM,gBAAgB;AAAA,EAC3B,OAAgB,iBAAiB,OAAO,gBAAgB;AAAA,EAExD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ,IAAI,gCAAuE;AAAA,EACnF;AAAA,EACA,SAAS,IAAI,oBAAO;AAAA,EACpB,cAAU,gBAAI;AAAA,EAEd,YAAY,UAAkB,WAAyB,cAA4B,KAAU;AAC3F,SAAK,YAAY;AACjB,SAAK,YAAY;AACjB,SAAK,gBAAgB;AACrB,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,CAAC,CAAC,KAAK;AAAA,EAChB;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK,OAAO;AAAA,EACrB;AAAA,EAEA,IAAI,aAAwC;AAC1C,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,OAAsB;AACpB,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,SAAK,cAAc,KAAK,cAAc,KAAK,KAAK,WAAW,KAAK,KAAK;AACrE,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,YAAY;AA1Dd;AA2DI,QAAI,KAAK,aAAa;AACpB;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,UAAU,CAAC,EAAE,MAAM,gCAAgC;AACvF,eAAK,gBAAL,mBAAkB;AAClB,SAAK,OAAO,QAAQ;AAAA,EACtB;AACF;AAEO,MAAM,YAAY;AAAA,EACvB;AAAA,EACA;AAAA,EACA,SAAqC,CAAC;AAAA,EAEtC,YAAY,cAA4B,KAAU;AAChD,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,IAAI,UAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,OAAO,QAAQ,CAAC,SAAS,KAAK,OAAO,CAAC;AAC3C,UAAM,QAAQ,IAAI,KAAK,MAAM;AAAA,EAC/B;AAAA,EAEA,WAAW,UAAkB,WAA0C;AACrE,UAAM,SAAS,IAAI,gBAAgB,UAAU,WAAW,KAAK,eAAe,KAAK,IAAI;AACrF,UAAM,OAAO,KAAK,YAAY,MAAM;AACpC,SAAK,OAAO,KAAK,IAAI;AACrB,SAAK,QAAQ,MAAM,KAAK,OAAO,OAAO,KAAK,OAAO,QAAQ,IAAI,CAAC,CAAC;AAChE,WAAO;AAAA,EACT;AAAA,EAEA,YAAY,QAAmD;AAE7D,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,YAAY,MAAM,OAAO;AAC/B,UAAI;AACJ,UAAI,OAAO,cAAc,UAAU;AACjC,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C,OAAO;AACL,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C;AAEA,eAAS,MAAM;AACb,2CAAiB,IAAI;AAAA,MACvB,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,MAC/C,UAAE;AACA,YAAI,OAAO,OAAO,MAAM;AACtB,6CAAiB,IAAI;AAAA,QACvB,OAAO;AACL,eAAK,KAAK,CAAC,SAAS;AAClB,mBAAO,OAAO;AAAA,UAChB,CAAC;AAAA,QACH;AAAA,MACF;AAEA,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AACF;AAEA,MAAM,sBAAsB,CAAC,MAAc,WAAwD;AAEjG,SAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,cAAU,SAAS,IAAI;AACvB,cAAU,MAAM;AAChB,cAAU,SAAS;AACnB,qBAAiB,SAAS,WAAW;AACnC,UAAI,aAAa,UAAU,4BAAiB,eAAe;AACzD;AAAA,MACF;AACA,aAAO,MAAM,IAAI,MAAM,KAAK;AAAA,IAC9B;AACA,WAAO,MAAM,IAAI,gBAAgB,cAAc;AAE/C,YAAQ,IAAI;AAAA,EACd,CAAC;AACH;AAEA,MAAM,sBAAsB,CAC1B,QACA,WAC+B;AAE/B,SAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,WAAW;AACf,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,UAAM,qBAAqB,YAAY;AACrC,uBAAiB,SAAS,WAAW;AACnC,YAAI,UAAW;AACf,YAAI,UAAU,4BAAiB,eAAe;AAC5C;AAAA,QACF;AACA,eAAO,MAAM,IAAI,MAAM,KAAK;AAAA,MAC9B;AACA,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,uBAAmB;AAEnB,qBAAiB,QAAQ,QAAQ;AAC/B,kBAAY;AACZ,UAAI,UAAW;AACf,gBAAU,SAAS,IAAI;AAAA,IACzB;AACA,cAAU,MAAM;AAChB,cAAU,SAAS;AAEnB,YAAQ,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_output.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { SynthesizeStream, type TTS } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport type { AgentPlayout, PlayoutHandle } from './agent_playout.js';\n\nexport type SpeechSource = AsyncIterable<string> | string | Promise<string>;\n\nexport class SynthesisHandle {\n static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #speechId: string;\n text?: string;\n ttsSource: SpeechSource;\n #agentPlayout: AgentPlayout;\n tts: TTS;\n queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();\n #playHandle?: PlayoutHandle;\n intFut = new Future();\n #logger = log();\n synchronizer: TextAudioSynchronizer;\n\n constructor(\n speechId: string,\n ttsSource: SpeechSource,\n agentPlayout: AgentPlayout,\n tts: TTS,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.ttsSource = ttsSource;\n this.#agentPlayout = agentPlayout;\n this.tts = tts;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get validated(): boolean {\n return !!this.#playHandle;\n }\n\n get interrupted(): boolean {\n return this.intFut.done;\n }\n\n get playHandle(): PlayoutHandle | undefined {\n return this.#playHandle;\n }\n\n /** Validate the speech for playout. */\n play(): PlayoutHandle {\n if (this.interrupted) {\n throw new Error('synthesis was interrupted');\n }\n\n this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);\n return this.#playHandle;\n }\n\n /** Interrupt the speech. */\n interrupt() {\n if (this.interrupted) {\n return;\n }\n\n this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');\n this.#playHandle?.interrupt();\n this.intFut.resolve();\n }\n}\n\nexport class AgentOutput {\n #agentPlayout: AgentPlayout;\n #tts: TTS;\n #tasks: CancellablePromise<void>[] = [];\n\n constructor(agentPlayout: AgentPlayout, tts: TTS) {\n this.#agentPlayout = agentPlayout;\n this.#tts = tts;\n }\n\n get playout(): AgentPlayout {\n return this.#agentPlayout;\n }\n\n async close() {\n this.#tasks.forEach((task) => task.cancel());\n await Promise.all(this.#tasks);\n }\n\n synthesize(\n speechId: string,\n ttsSource: SpeechSource,\n synchronizer: TextAudioSynchronizer,\n ): SynthesisHandle {\n const handle = new SynthesisHandle(\n speechId,\n ttsSource,\n this.#agentPlayout,\n this.#tts,\n synchronizer,\n );\n const task = this.#synthesize(handle);\n this.#tasks.push(task);\n task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));\n return handle;\n }\n\n #synthesize(handle: SynthesisHandle): CancellablePromise<void> {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const ttsSource = await handle.ttsSource;\n let task: CancellablePromise<string>;\n if (typeof ttsSource === 'string') {\n task = stringSynthesisTask(ttsSource, handle);\n } else {\n task = streamSynthesisTask(ttsSource, handle);\n }\n\n onCancel(() => {\n gracefullyCancel(task);\n });\n\n try {\n await Promise.any([task, handle.intFut.await]);\n } finally {\n if (handle.intFut.done) {\n gracefullyCancel(task);\n } else {\n task.then((text) => {\n handle.text = text;\n });\n }\n }\n\n resolve();\n });\n }\n}\n\nconst stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n ttsStream.pushText(text);\n handle.synchronizer.pushText(text);\n handle.synchronizer.markTextSegmentEnd();\n ttsStream.flush();\n ttsStream.endInput();\n for await (const audio of ttsStream) {\n if (cancelled || audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n\n resolve(text);\n });\n};\n\nconst streamSynthesisTask = (\n stream: AsyncIterable<string>,\n handle: SynthesisHandle,\n): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let fullText = '';\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n const readGeneratedAudio = async () => {\n for await (const audio of ttsStream) {\n if (cancelled) break;\n if (audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n };\n readGeneratedAudio();\n\n for await (const text of stream) {\n fullText += text;\n if (cancelled) break;\n handle.synchronizer.pushText(text);\n ttsStream.pushText(text);\n }\n handle.synchronizer.markTextSegmentEnd();\n\n // end the audio queue early if there is no actual text to turn into speech\n if (!fullText || fullText.trim().length === 0) {\n cancelled = true;\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n }\n ttsStream.flush();\n ttsStream.endInput();\n\n resolve(fullText);\n });\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,iBAAoB;AAEpB,iBAA2C;AAC3C,mBAAiF;AAK1E,MAAM,gBAAgB;AAAA,EAC3B,OAAgB,iBAAiB,OAAO,gBAAgB;AAAA,EAExD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ,IAAI,gCAAuE;AAAA,EACnF;AAAA,EACA,SAAS,IAAI,oBAAO;AAAA,EACpB,cAAU,gBAAI;AAAA,EACd;AAAA,EAEA,YACE,UACA,WACA,cACA,KACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,YAAY;AACjB,SAAK,gBAAgB;AACrB,SAAK,MAAM;AACX,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,CAAC,CAAC,KAAK;AAAA,EAChB;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK,OAAO;AAAA,EACrB;AAAA,EAEA,IAAI,aAAwC;AAC1C,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,OAAsB;AACpB,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,SAAK,cAAc,KAAK,cAAc,KAAK,KAAK,WAAW,KAAK,OAAO,KAAK,YAAY;AACxF,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,YAAY;AAnEd;AAoEI,QAAI,KAAK,aAAa;AACpB;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,UAAU,CAAC,EAAE,MAAM,gCAAgC;AACvF,eAAK,gBAAL,mBAAkB;AAClB,SAAK,OAAO,QAAQ;AAAA,EACtB;AACF;AAEO,MAAM,YAAY;AAAA,EACvB;AAAA,EACA;AAAA,EACA,SAAqC,CAAC;AAAA,EAEtC,YAAY,cAA4B,KAAU;AAChD,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,IAAI,UAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,OAAO,QAAQ,CAAC,SAAS,KAAK,OAAO,CAAC;AAC3C,UAAM,QAAQ,IAAI,KAAK,MAAM;AAAA,EAC/B;AAAA,EAEA,WACE,UACA,WACA,cACiB;AACjB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA;AAAA,MACA,KAAK;AAAA,MACL,KAAK;AAAA,MACL;AAAA,IACF;AACA,UAAM,OAAO,KAAK,YAAY,MAAM;AACpC,SAAK,OAAO,KAAK,IAAI;AACrB,SAAK,QAAQ,MAAM,KAAK,OAAO,OAAO,KAAK,OAAO,QAAQ,IAAI,CAAC,CAAC;AAChE,WAAO;AAAA,EACT;AAAA,EAEA,YAAY,QAAmD;AAE7D,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,YAAY,MAAM,OAAO;AAC/B,UAAI;AACJ,UAAI,OAAO,cAAc,UAAU;AACjC,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C,OAAO;AACL,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C;AAEA,eAAS,MAAM;AACb,2CAAiB,IAAI;AAAA,MACvB,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,MAC/C,UAAE;AACA,YAAI,OAAO,OAAO,MAAM;AACtB,6CAAiB,IAAI;AAAA,QACvB,OAAO;AACL,eAAK,KAAK,CAAC,SAAS;AAClB,mBAAO,OAAO;AAAA,UAChB,CAAC;AAAA,QACH;AAAA,MACF;AAEA,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AACF;AAEA,MAAM,sBAAsB,CAAC,MAAc,WAAwD;AAEjG,SAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,cAAU,SAAS,IAAI;AACvB,WAAO,aAAa,SAAS,IAAI;AACjC,WAAO,aAAa,mBAAmB;AACvC,cAAU,MAAM;AAChB,cAAU,SAAS;AACnB,qBAAiB,SAAS,WAAW;AACnC,UAAI,aAAa,UAAU,4BAAiB,eAAe;AACzD;AAAA,MACF;AACA,aAAO,MAAM,IAAI,MAAM,KAAK;AAAA,IAC9B;AACA,WAAO,MAAM,IAAI,gBAAgB,cAAc;AAE/C,YAAQ,IAAI;AAAA,EACd,CAAC;AACH;AAEA,MAAM,sBAAsB,CAC1B,QACA,WAC+B;AAE/B,SAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,WAAW;AACf,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,UAAM,qBAAqB,YAAY;AACrC,uBAAiB,SAAS,WAAW;AACnC,YAAI,UAAW;AACf,YAAI,UAAU,4BAAiB,eAAe;AAC5C;AAAA,QACF;AACA,eAAO,MAAM,IAAI,MAAM,KAAK;AAAA,MAC9B;AACA,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,uBAAmB;AAEnB,qBAAiB,QAAQ,QAAQ;AAC/B,kBAAY;AACZ,UAAI,UAAW;AACf,aAAO,aAAa,SAAS,IAAI;AACjC,gBAAU,SAAS,IAAI;AAAA,IACzB;AACA,WAAO,aAAa,mBAAmB;AAGvC,QAAI,CAAC,YAAY,SAAS,KAAK,EAAE,WAAW,GAAG;AAC7C,kBAAY;AACZ,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,cAAU,MAAM;AAChB,cAAU,SAAS;AAEnB,YAAQ,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
@@ -1,4 +1,5 @@
1
1
  import type { AudioFrame } from '@livekit/rtc-node';
2
+ import type { TextAudioSynchronizer } from '../transcription.js';
2
3
  import { type TTS } from '../tts/index.js';
3
4
  import { AsyncIterableQueue, Future } from '../utils.js';
4
5
  import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
@@ -11,7 +12,8 @@ export declare class SynthesisHandle {
11
12
  tts: TTS;
12
13
  queue: AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
13
14
  intFut: Future;
14
- constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS);
15
+ synchronizer: TextAudioSynchronizer;
16
+ constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS, synchronizer: TextAudioSynchronizer);
15
17
  get speechId(): string;
16
18
  get validated(): boolean;
17
19
  get interrupted(): boolean;
@@ -26,6 +28,6 @@ export declare class AgentOutput {
26
28
  constructor(agentPlayout: AgentPlayout, tts: TTS);
27
29
  get playout(): AgentPlayout;
28
30
  close(): Promise<void>;
29
- synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle;
31
+ synthesize(speechId: string, ttsSource: SpeechSource, synchronizer: TextAudioSynchronizer): SynthesisHandle;
30
32
  }
31
33
  //# sourceMappingURL=agent_output.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"agent_output.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_output.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAoB,KAAK,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC/F,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAEtE,MAAM,MAAM,YAAY,GAAG,aAAa,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;AAE5E,qBAAa,eAAe;;IAC1B,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IAG1D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,YAAY,CAAC;IAExB,GAAG,EAAE,GAAG,CAAC;IACT,KAAK,yEAAgF;IAErF,MAAM,SAAgB;gBAGV,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,GAAG,EAAE,GAAG;IAO3F,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,aAAa,GAAG,SAAS,CAE1C;IAED,uCAAuC;IACvC,IAAI,IAAI,aAAa;IASrB,4BAA4B;IAC5B,SAAS;CASV;AAED,qBAAa,WAAW;;gBAKV,YAAY,EAAE,YAAY,EAAE,GAAG,EAAE,GAAG;IAKhD,IAAI,OAAO,IAAI,YAAY,CAE1B;IAEK,KAAK;IAKX,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,YAAY,GAAG,eAAe;CAsCvE"}
1
+ {"version":3,"file":"agent_output.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_output.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AACjE,OAAO,EAAoB,KAAK,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC/F,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAEtE,MAAM,MAAM,YAAY,GAAG,aAAa,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;AAE5E,qBAAa,eAAe;;IAC1B,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IAG1D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,YAAY,CAAC;IAExB,GAAG,EAAE,GAAG,CAAC;IACT,KAAK,yEAAgF;IAErF,MAAM,SAAgB;IAEtB,YAAY,EAAE,qBAAqB,CAAC;gBAGlC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,YAAY,EACvB,YAAY,EAAE,YAAY,EAC1B,GAAG,EAAE,GAAG,EACR,YAAY,EAAE,qBAAqB;IASrC,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,aAAa,GAAG,SAAS,CAE1C;IAED,uCAAuC;IACvC,IAAI,IAAI,aAAa;IASrB,4BAA4B;IAC5B,SAAS;CASV;AAED,qBAAa,WAAW;;gBAKV,YAAY,EAAE,YAAY,EAAE,GAAG,EAAE,GAAG;IAKhD,IAAI,OAAO,IAAI,YAAY,CAE1B;IAEK,KAAK;IAKX,UAAU,CACR,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,YAAY,EACvB,YAAY,EAAE,qBAAqB,GAClC,eAAe;CA4CnB"}
@@ -12,11 +12,13 @@ class SynthesisHandle {
12
12
  #playHandle;
13
13
  intFut = new Future();
14
14
  #logger = log();
15
- constructor(speechId, ttsSource, agentPlayout, tts) {
15
+ synchronizer;
16
+ constructor(speechId, ttsSource, agentPlayout, tts, synchronizer) {
16
17
  this.#speechId = speechId;
17
18
  this.ttsSource = ttsSource;
18
19
  this.#agentPlayout = agentPlayout;
19
20
  this.tts = tts;
21
+ this.synchronizer = synchronizer;
20
22
  }
21
23
  get speechId() {
22
24
  return this.#speechId;
@@ -35,7 +37,7 @@ class SynthesisHandle {
35
37
  if (this.interrupted) {
36
38
  throw new Error("synthesis was interrupted");
37
39
  }
38
- this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);
40
+ this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);
39
41
  return this.#playHandle;
40
42
  }
41
43
  /** Interrupt the speech. */
@@ -64,8 +66,14 @@ class AgentOutput {
64
66
  this.#tasks.forEach((task) => task.cancel());
65
67
  await Promise.all(this.#tasks);
66
68
  }
67
- synthesize(speechId, ttsSource) {
68
- const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
69
+ synthesize(speechId, ttsSource, synchronizer) {
70
+ const handle = new SynthesisHandle(
71
+ speechId,
72
+ ttsSource,
73
+ this.#agentPlayout,
74
+ this.#tts,
75
+ synchronizer
76
+ );
69
77
  const task = this.#synthesize(handle);
70
78
  this.#tasks.push(task);
71
79
  task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));
@@ -106,6 +114,8 @@ const stringSynthesisTask = (text, handle) => {
106
114
  });
107
115
  const ttsStream = handle.tts.stream();
108
116
  ttsStream.pushText(text);
117
+ handle.synchronizer.pushText(text);
118
+ handle.synchronizer.markTextSegmentEnd();
109
119
  ttsStream.flush();
110
120
  ttsStream.endInput();
111
121
  for await (const audio of ttsStream) {
@@ -140,8 +150,14 @@ const streamSynthesisTask = (stream, handle) => {
140
150
  for await (const text of stream) {
141
151
  fullText += text;
142
152
  if (cancelled) break;
153
+ handle.synchronizer.pushText(text);
143
154
  ttsStream.pushText(text);
144
155
  }
156
+ handle.synchronizer.markTextSegmentEnd();
157
+ if (!fullText || fullText.trim().length === 0) {
158
+ cancelled = true;
159
+ handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
160
+ }
145
161
  ttsStream.flush();
146
162
  ttsStream.endInput();
147
163
  resolve(fullText);
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_output.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { log } from '../log.js';\nimport { SynthesizeStream, type TTS } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport type { AgentPlayout, PlayoutHandle } from './agent_playout.js';\n\nexport type SpeechSource = AsyncIterable<string> | string | Promise<string>;\n\nexport class SynthesisHandle {\n static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #speechId: string;\n text?: string;\n ttsSource: SpeechSource;\n #agentPlayout: AgentPlayout;\n tts: TTS;\n queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();\n #playHandle?: PlayoutHandle;\n intFut = new Future();\n #logger = log();\n\n constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {\n this.#speechId = speechId;\n this.ttsSource = ttsSource;\n this.#agentPlayout = agentPlayout;\n this.tts = tts;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get validated(): boolean {\n return !!this.#playHandle;\n }\n\n get interrupted(): boolean {\n return this.intFut.done;\n }\n\n get playHandle(): PlayoutHandle | undefined {\n return this.#playHandle;\n }\n\n /** Validate the speech for playout. */\n play(): PlayoutHandle {\n if (this.interrupted) {\n throw new Error('synthesis was interrupted');\n }\n\n this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);\n return this.#playHandle;\n }\n\n /** Interrupt the speech. */\n interrupt() {\n if (this.interrupted) {\n return;\n }\n\n this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');\n this.#playHandle?.interrupt();\n this.intFut.resolve();\n }\n}\n\nexport class AgentOutput {\n #agentPlayout: AgentPlayout;\n #tts: TTS;\n #tasks: CancellablePromise<void>[] = [];\n\n constructor(agentPlayout: AgentPlayout, tts: TTS) {\n this.#agentPlayout = agentPlayout;\n this.#tts = tts;\n }\n\n get playout(): AgentPlayout {\n return this.#agentPlayout;\n }\n\n async close() {\n this.#tasks.forEach((task) => task.cancel());\n await Promise.all(this.#tasks);\n }\n\n synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {\n const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);\n const task = this.#synthesize(handle);\n this.#tasks.push(task);\n task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));\n return handle;\n }\n\n #synthesize(handle: SynthesisHandle): CancellablePromise<void> {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const ttsSource = await handle.ttsSource;\n let task: CancellablePromise<string>;\n if (typeof ttsSource === 'string') {\n task = stringSynthesisTask(ttsSource, handle);\n } else {\n task = streamSynthesisTask(ttsSource, handle);\n }\n\n onCancel(() => {\n gracefullyCancel(task);\n });\n\n try {\n await Promise.any([task, handle.intFut.await]);\n } finally {\n if (handle.intFut.done) {\n gracefullyCancel(task);\n } else {\n task.then((text) => {\n handle.text = text;\n });\n }\n }\n\n resolve();\n });\n }\n}\n\nconst stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n ttsStream.pushText(text);\n ttsStream.flush();\n ttsStream.endInput();\n for await (const audio of ttsStream) {\n if (cancelled || audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n\n resolve(text);\n });\n};\n\nconst streamSynthesisTask = (\n stream: AsyncIterable<string>,\n handle: SynthesisHandle,\n): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let fullText = '';\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n const readGeneratedAudio = async () => {\n for await (const audio of ttsStream) {\n if (cancelled) break;\n if (audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n };\n readGeneratedAudio();\n\n for await (const text of stream) {\n fullText += text;\n if (cancelled) break;\n ttsStream.pushText(text);\n }\n ttsStream.flush();\n ttsStream.endInput();\n\n resolve(fullText);\n });\n};\n"],"mappings":"AAIA,SAAS,WAAW;AACpB,SAAS,wBAAkC;AAC3C,SAAS,oBAAoB,oBAAoB,QAAQ,wBAAwB;AAK1E,MAAM,gBAAgB;AAAA,EAC3B,OAAgB,iBAAiB,OAAO,gBAAgB;AAAA,EAExD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ,IAAI,mBAAuE;AAAA,EACnF;AAAA,EACA,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI;AAAA,EAEd,YAAY,UAAkB,WAAyB,cAA4B,KAAU;AAC3F,SAAK,YAAY;AACjB,SAAK,YAAY;AACjB,SAAK,gBAAgB;AACrB,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,CAAC,CAAC,KAAK;AAAA,EAChB;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK,OAAO;AAAA,EACrB;AAAA,EAEA,IAAI,aAAwC;AAC1C,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,OAAsB;AACpB,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,SAAK,cAAc,KAAK,cAAc,KAAK,KAAK,WAAW,KAAK,KAAK;AACrE,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,YAAY;AA1Dd;AA2DI,QAAI,KAAK,aAAa;AACpB;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,UAAU,CAAC,EAAE,MAAM,gCAAgC;AACvF,eAAK,gBAAL,mBAAkB;AAClB,SAAK,OAAO,QAAQ;AAAA,EACtB;AACF;AAEO,MAAM,YAAY;AAAA,EACvB;AAAA,EACA;AAAA,EACA,SAAqC,CAAC;AAAA,EAEtC,YAAY,cAA4B,KAAU;AAChD,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,IAAI,UAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,OAAO,QAAQ,CAAC,SAAS,KAAK,OAAO,CAAC;AAC3C,UAAM,QAAQ,IAAI,KAAK,MAAM;AAAA,EAC/B;AAAA,EAEA,WAAW,UAAkB,WAA0C;AACrE,UAAM,SAAS,IAAI,gBAAgB,UAAU,WAAW,KAAK,eAAe,KAAK,IAAI;AACrF,UAAM,OAAO,KAAK,YAAY,MAAM;AACpC,SAAK,OAAO,KAAK,IAAI;AACrB,SAAK,QAAQ,MAAM,KAAK,OAAO,OAAO,KAAK,OAAO,QAAQ,IAAI,CAAC,CAAC;AAChE,WAAO;AAAA,EACT;AAAA,EAEA,YAAY,QAAmD;AAE7D,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,YAAY,MAAM,OAAO;AAC/B,UAAI;AACJ,UAAI,OAAO,cAAc,UAAU;AACjC,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C,OAAO;AACL,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C;AAEA,eAAS,MAAM;AACb,yBAAiB,IAAI;AAAA,MACvB,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,MAC/C,UAAE;AACA,YAAI,OAAO,OAAO,MAAM;AACtB,2BAAiB,IAAI;AAAA,QACvB,OAAO;AACL,eAAK,KAAK,CAAC,SAAS;AAClB,mBAAO,OAAO;AAAA,UAChB,CAAC;AAAA,QACH;AAAA,MACF;AAEA,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AACF;AAEA,MAAM,sBAAsB,CAAC,MAAc,WAAwD;AAEjG,SAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,cAAU,SAAS,IAAI;AACvB,cAAU,MAAM;AAChB,cAAU,SAAS;AACnB,qBAAiB,SAAS,WAAW;AACnC,UAAI,aAAa,UAAU,iBAAiB,eAAe;AACzD;AAAA,MACF;AACA,aAAO,MAAM,IAAI,MAAM,KAAK;AAAA,IAC9B;AACA,WAAO,MAAM,IAAI,gBAAgB,cAAc;AAE/C,YAAQ,IAAI;AAAA,EACd,CAAC;AACH;AAEA,MAAM,sBAAsB,CAC1B,QACA,WAC+B;AAE/B,SAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,WAAW;AACf,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,UAAM,qBAAqB,YAAY;AACrC,uBAAiB,SAAS,WAAW;AACnC,YAAI,UAAW;AACf,YAAI,UAAU,iBAAiB,eAAe;AAC5C;AAAA,QACF;AACA,eAAO,MAAM,IAAI,MAAM,KAAK;AAAA,MAC9B;AACA,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,uBAAmB;AAEnB,qBAAiB,QAAQ,QAAQ;AAC/B,kBAAY;AACZ,UAAI,UAAW;AACf,gBAAU,SAAS,IAAI;AAAA,IACzB;AACA,cAAU,MAAM;AAChB,cAAU,SAAS;AAEnB,YAAQ,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_output.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { SynthesizeStream, type TTS } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport type { AgentPlayout, PlayoutHandle } from './agent_playout.js';\n\nexport type SpeechSource = AsyncIterable<string> | string | Promise<string>;\n\nexport class SynthesisHandle {\n static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #speechId: string;\n text?: string;\n ttsSource: SpeechSource;\n #agentPlayout: AgentPlayout;\n tts: TTS;\n queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();\n #playHandle?: PlayoutHandle;\n intFut = new Future();\n #logger = log();\n synchronizer: TextAudioSynchronizer;\n\n constructor(\n speechId: string,\n ttsSource: SpeechSource,\n agentPlayout: AgentPlayout,\n tts: TTS,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.ttsSource = ttsSource;\n this.#agentPlayout = agentPlayout;\n this.tts = tts;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get validated(): boolean {\n return !!this.#playHandle;\n }\n\n get interrupted(): boolean {\n return this.intFut.done;\n }\n\n get playHandle(): PlayoutHandle | undefined {\n return this.#playHandle;\n }\n\n /** Validate the speech for playout. */\n play(): PlayoutHandle {\n if (this.interrupted) {\n throw new Error('synthesis was interrupted');\n }\n\n this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);\n return this.#playHandle;\n }\n\n /** Interrupt the speech. */\n interrupt() {\n if (this.interrupted) {\n return;\n }\n\n this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');\n this.#playHandle?.interrupt();\n this.intFut.resolve();\n }\n}\n\nexport class AgentOutput {\n #agentPlayout: AgentPlayout;\n #tts: TTS;\n #tasks: CancellablePromise<void>[] = [];\n\n constructor(agentPlayout: AgentPlayout, tts: TTS) {\n this.#agentPlayout = agentPlayout;\n this.#tts = tts;\n }\n\n get playout(): AgentPlayout {\n return this.#agentPlayout;\n }\n\n async close() {\n this.#tasks.forEach((task) => task.cancel());\n await Promise.all(this.#tasks);\n }\n\n synthesize(\n speechId: string,\n ttsSource: SpeechSource,\n synchronizer: TextAudioSynchronizer,\n ): SynthesisHandle {\n const handle = new SynthesisHandle(\n speechId,\n ttsSource,\n this.#agentPlayout,\n this.#tts,\n synchronizer,\n );\n const task = this.#synthesize(handle);\n this.#tasks.push(task);\n task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));\n return handle;\n }\n\n #synthesize(handle: SynthesisHandle): CancellablePromise<void> {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const ttsSource = await handle.ttsSource;\n let task: CancellablePromise<string>;\n if (typeof ttsSource === 'string') {\n task = stringSynthesisTask(ttsSource, handle);\n } else {\n task = streamSynthesisTask(ttsSource, handle);\n }\n\n onCancel(() => {\n gracefullyCancel(task);\n });\n\n try {\n await Promise.any([task, handle.intFut.await]);\n } finally {\n if (handle.intFut.done) {\n gracefullyCancel(task);\n } else {\n task.then((text) => {\n handle.text = text;\n });\n }\n }\n\n resolve();\n });\n }\n}\n\nconst stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n ttsStream.pushText(text);\n handle.synchronizer.pushText(text);\n handle.synchronizer.markTextSegmentEnd();\n ttsStream.flush();\n ttsStream.endInput();\n for await (const audio of ttsStream) {\n if (cancelled || audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n\n resolve(text);\n });\n};\n\nconst streamSynthesisTask = (\n stream: AsyncIterable<string>,\n handle: SynthesisHandle,\n): CancellablePromise<string> => {\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let fullText = '';\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const ttsStream = handle.tts.stream();\n const readGeneratedAudio = async () => {\n for await (const audio of ttsStream) {\n if (cancelled) break;\n if (audio === SynthesizeStream.END_OF_STREAM) {\n break;\n }\n handle.queue.put(audio.frame);\n }\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n };\n readGeneratedAudio();\n\n for await (const text of stream) {\n fullText += text;\n if (cancelled) break;\n handle.synchronizer.pushText(text);\n ttsStream.pushText(text);\n }\n handle.synchronizer.markTextSegmentEnd();\n\n // end the audio queue early if there is no actual text to turn into speech\n if (!fullText || fullText.trim().length === 0) {\n cancelled = true;\n handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);\n }\n ttsStream.flush();\n ttsStream.endInput();\n\n resolve(fullText);\n });\n};\n"],"mappings":"AAIA,SAAS,WAAW;AAEpB,SAAS,wBAAkC;AAC3C,SAAS,oBAAoB,oBAAoB,QAAQ,wBAAwB;AAK1E,MAAM,gBAAgB;AAAA,EAC3B,OAAgB,iBAAiB,OAAO,gBAAgB;AAAA,EAExD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,QAAQ,IAAI,mBAAuE;AAAA,EACnF;AAAA,EACA,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI;AAAA,EACd;AAAA,EAEA,YACE,UACA,WACA,cACA,KACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,YAAY;AACjB,SAAK,gBAAgB;AACrB,SAAK,MAAM;AACX,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,YAAqB;AACvB,WAAO,CAAC,CAAC,KAAK;AAAA,EAChB;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK,OAAO;AAAA,EACrB;AAAA,EAEA,IAAI,aAAwC;AAC1C,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,OAAsB;AACpB,QAAI,KAAK,aAAa;AACpB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,SAAK,cAAc,KAAK,cAAc,KAAK,KAAK,WAAW,KAAK,OAAO,KAAK,YAAY;AACxF,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,YAAY;AAnEd;AAoEI,QAAI,KAAK,aAAa;AACpB;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,UAAU,CAAC,EAAE,MAAM,gCAAgC;AACvF,eAAK,gBAAL,mBAAkB;AAClB,SAAK,OAAO,QAAQ;AAAA,EACtB;AACF;AAEO,MAAM,YAAY;AAAA,EACvB;AAAA,EACA;AAAA,EACA,SAAqC,CAAC;AAAA,EAEtC,YAAY,cAA4B,KAAU;AAChD,SAAK,gBAAgB;AACrB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,IAAI,UAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,OAAO,QAAQ,CAAC,SAAS,KAAK,OAAO,CAAC;AAC3C,UAAM,QAAQ,IAAI,KAAK,MAAM;AAAA,EAC/B;AAAA,EAEA,WACE,UACA,WACA,cACiB;AACjB,UAAM,SAAS,IAAI;AAAA,MACjB;AAAA,MACA;AAAA,MACA,KAAK;AAAA,MACL,KAAK;AAAA,MACL;AAAA,IACF;AACA,UAAM,OAAO,KAAK,YAAY,MAAM;AACpC,SAAK,OAAO,KAAK,IAAI;AACrB,SAAK,QAAQ,MAAM,KAAK,OAAO,OAAO,KAAK,OAAO,QAAQ,IAAI,CAAC,CAAC;AAChE,WAAO;AAAA,EACT;AAAA,EAEA,YAAY,QAAmD;AAE7D,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,YAAY,MAAM,OAAO;AAC/B,UAAI;AACJ,UAAI,OAAO,cAAc,UAAU;AACjC,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C,OAAO;AACL,eAAO,oBAAoB,WAAW,MAAM;AAAA,MAC9C;AAEA,eAAS,MAAM;AACb,yBAAiB,IAAI;AAAA,MACvB,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,MAC/C,UAAE;AACA,YAAI,OAAO,OAAO,MAAM;AACtB,2BAAiB,IAAI;AAAA,QACvB,OAAO;AACL,eAAK,KAAK,CAAC,SAAS;AAClB,mBAAO,OAAO;AAAA,UAChB,CAAC;AAAA,QACH;AAAA,MACF;AAEA,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AACF;AAEA,MAAM,sBAAsB,CAAC,MAAc,WAAwD;AAEjG,SAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,cAAU,SAAS,IAAI;AACvB,WAAO,aAAa,SAAS,IAAI;AACjC,WAAO,aAAa,mBAAmB;AACvC,cAAU,MAAM;AAChB,cAAU,SAAS;AACnB,qBAAiB,SAAS,WAAW;AACnC,UAAI,aAAa,UAAU,iBAAiB,eAAe;AACzD;AAAA,MACF;AACA,aAAO,MAAM,IAAI,MAAM,KAAK;AAAA,IAC9B;AACA,WAAO,MAAM,IAAI,gBAAgB,cAAc;AAE/C,YAAQ,IAAI;AAAA,EACd,CAAC;AACH;AAEA,MAAM,sBAAsB,CAC1B,QACA,WAC+B;AAE/B,SAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,QAAI,WAAW;AACf,QAAI,YAAY;AAChB,aAAS,MAAM;AACb,kBAAY;AAAA,IACd,CAAC;AAED,UAAM,YAAY,OAAO,IAAI,OAAO;AACpC,UAAM,qBAAqB,YAAY;AACrC,uBAAiB,SAAS,WAAW;AACnC,YAAI,UAAW;AACf,YAAI,UAAU,iBAAiB,eAAe;AAC5C;AAAA,QACF;AACA,eAAO,MAAM,IAAI,MAAM,KAAK;AAAA,MAC9B;AACA,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,uBAAmB;AAEnB,qBAAiB,QAAQ,QAAQ;AAC/B,kBAAY;AACZ,UAAI,UAAW;AACf,aAAO,aAAa,SAAS,IAAI;AACjC,gBAAU,SAAS,IAAI;AAAA,IACzB;AACA,WAAO,aAAa,mBAAmB;AAGvC,QAAI,CAAC,YAAY,SAAS,KAAK,EAAE,WAAW,GAAG;AAC7C,kBAAY;AACZ,aAAO,MAAM,IAAI,gBAAgB,cAAc;AAAA,IACjD;AACA,cAAU,MAAM;AAChB,cAAU,SAAS;AAEnB,YAAQ,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
@@ -47,14 +47,16 @@ class PlayoutHandle {
47
47
  #audioSource;
48
48
  playoutSource;
49
49
  totalPlayedTime;
50
+ synchronizer;
50
51
  #interrupted = false;
51
52
  pushedDuration = 0;
52
53
  intFut = new import_utils.Future();
53
54
  doneFut = new import_utils.Future();
54
- constructor(speechId, audioSource, playoutSource) {
55
+ constructor(speechId, audioSource, playoutSource, synchronizer) {
55
56
  this.#speechId = speechId;
56
57
  this.#audioSource = audioSource;
57
58
  this.playoutSource = playoutSource;
59
+ this.synchronizer = synchronizer;
58
60
  }
59
61
  get speechId() {
60
62
  return this.#speechId;
@@ -95,11 +97,11 @@ class AgentPlayout extends import_node_events.default {
95
97
  set targetVolume(vol) {
96
98
  this.#targetVolume = vol;
97
99
  }
98
- play(speechId, playoutSource) {
100
+ play(speechId, playoutSource, synchronizer) {
99
101
  if (this.#closed) {
100
102
  throw new Error("source closed");
101
103
  }
102
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
104
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
103
105
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
104
106
  return handle;
105
107
  }
@@ -109,6 +111,7 @@ class AgentPlayout extends import_node_events.default {
109
111
  captureTask.cancel();
110
112
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
111
113
  if (handle.interrupted || captureTask.error) {
114
+ handle.synchronizer.close(true);
112
115
  this.#audioSource.clearQueue();
113
116
  }
114
117
  if (!firstFrame) {
@@ -139,12 +142,15 @@ class AgentPlayout extends import_node_events.default {
139
142
  if (firstFrame) {
140
143
  this.#logger.child({ speechId: handle.speechId }).debug("started playing the first time");
141
144
  this.emit(0 /* PLAYOUT_STARTED */);
145
+ handle.synchronizer.segmentPlayoutStarted();
142
146
  firstFrame = false;
143
147
  }
144
148
  handle.pushedDuration += frame.samplesPerChannel / frame.sampleRate * 1e3;
149
+ handle.synchronizer.pushAudio(frame);
145
150
  await this.#audioSource.captureFrame(frame);
146
151
  await this.#audioSource.waitForPlayout();
147
152
  }
153
+ handle.synchronizer.close(false);
148
154
  resolve2();
149
155
  });
150
156
  try {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAAyB;AACzB,iBAAoB;AACpB,mBAA6D;AAC7D,0BAAgC;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,oBAAO;AAAA,EACpB,UAAU,IAAI,oBAAO;AAAA,EAErB,YACE,UACA,aACA,eACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,mBAAAC,QAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,cAAU,gBAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,aAAa;AAE3E,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,gCAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,oCAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","EventEmitter","resolve","_","onCancel"]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n synchronizer: TextAudioSynchronizer;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n handle.synchronizer.close(true);\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n handle.synchronizer.segmentPlayoutStarted();\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n handle.synchronizer.pushAudio(frame);\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n handle.synchronizer.close(false);\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAAyB;AACzB,iBAAoB;AAEpB,mBAA6D;AAC7D,0BAAgC;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,oBAAO;AAAA,EACpB,UAAU,IAAI,oBAAO;AAAA,EAErB,YACE,UACA,aACA,eACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AACrB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,mBAAAC,QAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,cAAU,gBAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACA,cACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,eAAe,YAAY;AAEzF,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,iBAAO,aAAa,MAAM,IAAI;AAC9B,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,gCAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,oCAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,mBAAO,aAAa,sBAAsB;AAC1C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,iBAAO,aAAa,UAAU,KAAK;AACnC,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,eAAO,aAAa,MAAM,KAAK;AAC/B,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","EventEmitter","resolve","_","onCancel"]}
@@ -1,5 +1,6 @@
1
1
  import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
2
2
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
+ import type { TextAudioSynchronizer } from '../transcription.js';
3
4
  import { Future } from '../utils.js';
4
5
  import { SynthesisHandle } from './agent_output.js';
5
6
  export declare enum AgentPlayoutEvent {
@@ -14,10 +15,11 @@ export declare class PlayoutHandle {
14
15
  #private;
15
16
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
16
17
  totalPlayedTime?: number;
18
+ synchronizer: TextAudioSynchronizer;
17
19
  pushedDuration: number;
18
20
  intFut: Future;
19
21
  doneFut: Future;
20
- constructor(speechId: string, audioSource: AudioSource, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>);
22
+ constructor(speechId: string, audioSource: AudioSource, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer);
21
23
  get speechId(): string;
22
24
  get interrupted(): boolean;
23
25
  get timePlayed(): number;
@@ -31,7 +33,7 @@ export declare class AgentPlayout extends AgentPlayout_base {
31
33
  constructor(audioSource: AudioSource);
32
34
  get targetVolume(): number;
33
35
  set targetVolume(vol: number);
34
- play(speechId: string, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>): PlayoutHandle;
36
+ play(speechId: string, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer): PlayoutHandle;
35
37
  close(): Promise<void>;
36
38
  }
37
39
  export {};
@@ -1 +1 @@
1
- {"version":3,"file":"agent_playout.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_playout.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACjE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,oBAAY,iBAAiB;IAC3B,eAAe,IAAA;IACf,eAAe,IAAA;CAChB;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,MAAM,IAAI,CAAC;IAChD,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,KAAK,IAAI,CAAC;CACnE,CAAC;AAEF,qBAAa,aAAa;;IAGxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,CAAC;IACjF,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB,cAAc,SAAK;IACnB,MAAM,SAAgB;IACtB,OAAO,SAAgB;gBAGrB,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC;IAOlF,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,IAAI,IAAI,OAAO,CAElB;IAED,SAAS;IAST,IAAI,IAAI,MAAM;CAGf;2CAE4D,aAAa,qBAAqB,CAAC;AAAhG,qBAAa,YAAa,SAAQ,iBAA+D;;gBAOnF,WAAW,EAAE,WAAW;IAKpC,IAAI,YAAY,IAAI,MAAM,CAEzB;IAED,IAAI,YAAY,CAAC,GAAG,EAAE,MAAM,EAE3B;IAED,IAAI,CACF,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,GAC/E,aAAa;IA2FV,KAAK;CAIZ"}
1
+ {"version":3,"file":"agent_playout.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_playout.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACjE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AACjE,OAAO,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,oBAAY,iBAAiB;IAC3B,eAAe,IAAA;IACf,eAAe,IAAA;CAChB;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,MAAM,IAAI,CAAC;IAChD,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,KAAK,IAAI,CAAC;CACnE,CAAC;AAEF,qBAAa,aAAa;;IAGxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,CAAC;IACjF,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,qBAAqB,CAAC;IAEpC,cAAc,SAAK;IACnB,MAAM,SAAgB;IACtB,OAAO,SAAgB;gBAGrB,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,EAChF,YAAY,EAAE,qBAAqB;IAQrC,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,IAAI,IAAI,OAAO,CAElB;IAED,SAAS;IAST,IAAI,IAAI,MAAM;CAGf;2CAE4D,aAAa,qBAAqB,CAAC;AAAhG,qBAAa,YAAa,SAAQ,iBAA+D;;gBAOnF,WAAW,EAAE,WAAW;IAKpC,IAAI,YAAY,IAAI,MAAM,CAEzB;IAED,IAAI,YAAY,CAAC,GAAG,EAAE,MAAM,EAE3B;IAED,IAAI,CACF,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,EAChF,YAAY,EAAE,qBAAqB,GAClC,aAAa;IA+FV,KAAK;CAIZ"}
@@ -12,14 +12,16 @@ class PlayoutHandle {
12
12
  #audioSource;
13
13
  playoutSource;
14
14
  totalPlayedTime;
15
+ synchronizer;
15
16
  #interrupted = false;
16
17
  pushedDuration = 0;
17
18
  intFut = new Future();
18
19
  doneFut = new Future();
19
- constructor(speechId, audioSource, playoutSource) {
20
+ constructor(speechId, audioSource, playoutSource, synchronizer) {
20
21
  this.#speechId = speechId;
21
22
  this.#audioSource = audioSource;
22
23
  this.playoutSource = playoutSource;
24
+ this.synchronizer = synchronizer;
23
25
  }
24
26
  get speechId() {
25
27
  return this.#speechId;
@@ -60,11 +62,11 @@ class AgentPlayout extends EventEmitter {
60
62
  set targetVolume(vol) {
61
63
  this.#targetVolume = vol;
62
64
  }
63
- play(speechId, playoutSource) {
65
+ play(speechId, playoutSource, synchronizer) {
64
66
  if (this.#closed) {
65
67
  throw new Error("source closed");
66
68
  }
67
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
69
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
68
70
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
69
71
  return handle;
70
72
  }
@@ -74,6 +76,7 @@ class AgentPlayout extends EventEmitter {
74
76
  captureTask.cancel();
75
77
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
76
78
  if (handle.interrupted || captureTask.error) {
79
+ handle.synchronizer.close(true);
77
80
  this.#audioSource.clearQueue();
78
81
  }
79
82
  if (!firstFrame) {
@@ -104,12 +107,15 @@ class AgentPlayout extends EventEmitter {
104
107
  if (firstFrame) {
105
108
  this.#logger.child({ speechId: handle.speechId }).debug("started playing the first time");
106
109
  this.emit(0 /* PLAYOUT_STARTED */);
110
+ handle.synchronizer.segmentPlayoutStarted();
107
111
  firstFrame = false;
108
112
  }
109
113
  handle.pushedDuration += frame.samplesPerChannel / frame.sampleRate * 1e3;
114
+ handle.synchronizer.pushAudio(frame);
110
115
  await this.#audioSource.captureFrame(frame);
111
116
  await this.#audioSource.waitForPlayout();
112
117
  }
118
+ handle.synchronizer.close(false);
113
119
  resolve2();
114
120
  });
115
121
  try {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":"AAKA,OAAO,kBAAkB;AACzB,SAAS,WAAW;AACpB,SAAS,oBAAoB,QAAQ,wBAAwB;AAC7D,SAAS,uBAAuB;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI,OAAO;AAAA,EAErB,YACE,UACA,aACA,eACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,aAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,UAAU,IAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,aAAa;AAE3E,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,cAAM,iBAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,mBAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,gBAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","resolve","_","onCancel"]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n synchronizer: TextAudioSynchronizer;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n handle.synchronizer.close(true);\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n handle.synchronizer.segmentPlayoutStarted();\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n handle.synchronizer.pushAudio(frame);\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n handle.synchronizer.close(false);\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":"AAKA,OAAO,kBAAkB;AACzB,SAAS,WAAW;AAEpB,SAAS,oBAAoB,QAAQ,wBAAwB;AAC7D,SAAS,uBAAuB;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI,OAAO;AAAA,EAErB,YACE,UACA,aACA,eACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AACrB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,aAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,UAAU,IAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACA,cACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,eAAe,YAAY;AAEzF,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,iBAAO,aAAa,MAAM,IAAI;AAC9B,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,cAAM,iBAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,mBAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,gBAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,mBAAO,aAAa,sBAAsB;AAC1C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,iBAAO,aAAa,UAAU,KAAK;AACnC,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,eAAO,aAAa,MAAM,KAAK;AAC/B,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","resolve","_","onCancel"]}