@livekit/agents 0.6.4 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. package/dist/cli.cjs +8 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +8 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/index.cjs +6 -1
  7. package/dist/index.cjs.map +1 -1
  8. package/dist/index.d.ts +3 -1
  9. package/dist/index.d.ts.map +1 -1
  10. package/dist/index.js +3 -0
  11. package/dist/index.js.map +1 -1
  12. package/dist/inference_runner.cjs +38 -0
  13. package/dist/inference_runner.cjs.map +1 -0
  14. package/dist/inference_runner.d.ts +11 -0
  15. package/dist/inference_runner.d.ts.map +1 -0
  16. package/dist/inference_runner.js +14 -0
  17. package/dist/inference_runner.js.map +1 -0
  18. package/dist/ipc/index.cjs +23 -0
  19. package/dist/ipc/index.cjs.map +1 -0
  20. package/dist/ipc/index.d.ts +2 -0
  21. package/dist/ipc/index.d.ts.map +1 -0
  22. package/dist/ipc/index.js +2 -0
  23. package/dist/ipc/index.js.map +1 -0
  24. package/dist/ipc/inference_executor.cjs +17 -0
  25. package/dist/ipc/inference_executor.cjs.map +1 -0
  26. package/dist/ipc/inference_executor.d.ts +4 -0
  27. package/dist/ipc/inference_executor.d.ts.map +1 -0
  28. package/dist/ipc/inference_executor.js +1 -0
  29. package/dist/ipc/inference_executor.js.map +1 -0
  30. package/dist/ipc/inference_proc_executor.cjs +97 -0
  31. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  32. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  33. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  34. package/dist/ipc/inference_proc_executor.js +72 -0
  35. package/dist/ipc/inference_proc_executor.js.map +1 -0
  36. package/dist/ipc/inference_proc_lazy_main.cjs +92 -0
  37. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  38. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  39. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  40. package/dist/ipc/inference_proc_lazy_main.js +69 -0
  41. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  42. package/dist/ipc/job_executor.cjs +8 -7
  43. package/dist/ipc/job_executor.cjs.map +1 -1
  44. package/dist/ipc/job_executor.d.ts +14 -15
  45. package/dist/ipc/job_executor.d.ts.map +1 -1
  46. package/dist/ipc/job_executor.js +7 -6
  47. package/dist/ipc/job_executor.js.map +1 -1
  48. package/dist/ipc/job_proc_executor.cjs +108 -0
  49. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  50. package/dist/ipc/job_proc_executor.d.ts +19 -0
  51. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  52. package/dist/ipc/job_proc_executor.js +83 -0
  53. package/dist/ipc/job_proc_executor.js.map +1 -0
  54. package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +46 -36
  55. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  56. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  57. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  58. package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +46 -11
  59. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  60. package/dist/ipc/message.cjs.map +1 -1
  61. package/dist/ipc/message.d.ts +17 -0
  62. package/dist/ipc/message.d.ts.map +1 -1
  63. package/dist/ipc/proc_pool.cjs +30 -4
  64. package/dist/ipc/proc_pool.cjs.map +1 -1
  65. package/dist/ipc/proc_pool.d.ts +5 -1
  66. package/dist/ipc/proc_pool.d.ts.map +1 -1
  67. package/dist/ipc/proc_pool.js +30 -4
  68. package/dist/ipc/proc_pool.js.map +1 -1
  69. package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +57 -45
  70. package/dist/ipc/supervised_proc.cjs.map +1 -0
  71. package/dist/ipc/supervised_proc.d.ts +30 -0
  72. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  73. package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +53 -31
  74. package/dist/ipc/supervised_proc.js.map +1 -0
  75. package/dist/job.cjs +18 -1
  76. package/dist/job.cjs.map +1 -1
  77. package/dist/job.d.ts +9 -1
  78. package/dist/job.d.ts.map +1 -1
  79. package/dist/job.js +17 -1
  80. package/dist/job.js.map +1 -1
  81. package/dist/multimodal/agent_playout.cjs +18 -16
  82. package/dist/multimodal/agent_playout.cjs.map +1 -1
  83. package/dist/multimodal/agent_playout.d.ts +4 -4
  84. package/dist/multimodal/agent_playout.d.ts.map +1 -1
  85. package/dist/multimodal/agent_playout.js +18 -16
  86. package/dist/multimodal/agent_playout.js.map +1 -1
  87. package/dist/multimodal/multimodal_agent.cjs +12 -8
  88. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  89. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  90. package/dist/multimodal/multimodal_agent.js +13 -9
  91. package/dist/multimodal/multimodal_agent.js.map +1 -1
  92. package/dist/pipeline/agent_output.cjs +22 -4
  93. package/dist/pipeline/agent_output.cjs.map +1 -1
  94. package/dist/pipeline/agent_output.d.ts +4 -2
  95. package/dist/pipeline/agent_output.d.ts.map +1 -1
  96. package/dist/pipeline/agent_output.js +22 -4
  97. package/dist/pipeline/agent_output.js.map +1 -1
  98. package/dist/pipeline/agent_playout.cjs +9 -3
  99. package/dist/pipeline/agent_playout.cjs.map +1 -1
  100. package/dist/pipeline/agent_playout.d.ts +4 -2
  101. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  102. package/dist/pipeline/agent_playout.js +9 -3
  103. package/dist/pipeline/agent_playout.js.map +1 -1
  104. package/dist/pipeline/human_input.cjs +6 -0
  105. package/dist/pipeline/human_input.cjs.map +1 -1
  106. package/dist/pipeline/human_input.d.ts +3 -1
  107. package/dist/pipeline/human_input.d.ts.map +1 -1
  108. package/dist/pipeline/human_input.js +6 -0
  109. package/dist/pipeline/human_input.js.map +1 -1
  110. package/dist/pipeline/pipeline_agent.cjs +79 -12
  111. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  112. package/dist/pipeline/pipeline_agent.d.ts +8 -0
  113. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  114. package/dist/pipeline/pipeline_agent.js +79 -12
  115. package/dist/pipeline/pipeline_agent.js.map +1 -1
  116. package/dist/stt/stream_adapter.cjs +16 -4
  117. package/dist/stt/stream_adapter.cjs.map +1 -1
  118. package/dist/stt/stream_adapter.d.ts.map +1 -1
  119. package/dist/stt/stream_adapter.js +16 -4
  120. package/dist/stt/stream_adapter.js.map +1 -1
  121. package/dist/tokenize/basic/basic.cjs +2 -0
  122. package/dist/tokenize/basic/basic.cjs.map +1 -1
  123. package/dist/tokenize/basic/basic.d.ts +2 -0
  124. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  125. package/dist/tokenize/basic/basic.js +1 -0
  126. package/dist/tokenize/basic/basic.js.map +1 -1
  127. package/dist/tokenize/basic/index.cjs +2 -0
  128. package/dist/tokenize/basic/index.cjs.map +1 -1
  129. package/dist/tokenize/basic/index.d.ts +1 -1
  130. package/dist/tokenize/basic/index.d.ts.map +1 -1
  131. package/dist/tokenize/basic/index.js +8 -1
  132. package/dist/tokenize/basic/index.js.map +1 -1
  133. package/dist/tokenize/token_stream.cjs +5 -3
  134. package/dist/tokenize/token_stream.cjs.map +1 -1
  135. package/dist/tokenize/token_stream.d.ts.map +1 -1
  136. package/dist/tokenize/token_stream.js +5 -3
  137. package/dist/tokenize/token_stream.js.map +1 -1
  138. package/dist/transcription.cjs +203 -86
  139. package/dist/transcription.cjs.map +1 -1
  140. package/dist/transcription.d.ts +24 -17
  141. package/dist/transcription.d.ts.map +1 -1
  142. package/dist/transcription.js +201 -85
  143. package/dist/transcription.js.map +1 -1
  144. package/dist/worker.cjs +42 -9
  145. package/dist/worker.cjs.map +1 -1
  146. package/dist/worker.d.ts +5 -1
  147. package/dist/worker.d.ts.map +1 -1
  148. package/dist/worker.js +42 -9
  149. package/dist/worker.js.map +1 -1
  150. package/package.json +3 -3
  151. package/src/cli.ts +9 -0
  152. package/src/index.ts +3 -1
  153. package/src/inference_runner.ts +19 -0
  154. package/src/ipc/index.ts +5 -0
  155. package/src/ipc/inference_executor.ts +7 -0
  156. package/src/ipc/inference_proc_executor.ts +93 -0
  157. package/src/ipc/inference_proc_lazy_main.ts +90 -0
  158. package/src/ipc/job_executor.ts +15 -17
  159. package/src/ipc/job_proc_executor.ts +112 -0
  160. package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +52 -14
  161. package/src/ipc/message.ts +14 -1
  162. package/src/ipc/proc_pool.ts +33 -3
  163. package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +77 -29
  164. package/src/job.ts +21 -0
  165. package/src/multimodal/agent_playout.ts +19 -18
  166. package/src/multimodal/multimodal_agent.ts +13 -9
  167. package/src/pipeline/agent_output.ts +36 -5
  168. package/src/pipeline/agent_playout.ts +10 -1
  169. package/src/pipeline/human_input.ts +8 -0
  170. package/src/pipeline/pipeline_agent.ts +96 -11
  171. package/src/stt/stream_adapter.ts +17 -5
  172. package/src/tokenize/basic/basic.ts +2 -0
  173. package/src/tokenize/basic/index.ts +7 -1
  174. package/src/tokenize/token_stream.ts +6 -3
  175. package/src/transcription.ts +270 -96
  176. package/src/worker.ts +42 -5
  177. package/dist/ipc/job_main.cjs.map +0 -1
  178. package/dist/ipc/job_main.d.ts +0 -8
  179. package/dist/ipc/job_main.d.ts.map +0 -1
  180. package/dist/ipc/job_main.js.map +0 -1
  181. package/dist/ipc/proc_job_executor.cjs.map +0 -1
  182. package/dist/ipc/proc_job_executor.d.ts +0 -15
  183. package/dist/ipc/proc_job_executor.d.ts.map +0 -1
  184. package/dist/ipc/proc_job_executor.js.map +0 -1
@@ -47,14 +47,16 @@ class PlayoutHandle {
47
47
  #audioSource;
48
48
  playoutSource;
49
49
  totalPlayedTime;
50
+ synchronizer;
50
51
  #interrupted = false;
51
52
  pushedDuration = 0;
52
53
  intFut = new import_utils.Future();
53
54
  doneFut = new import_utils.Future();
54
- constructor(speechId, audioSource, playoutSource) {
55
+ constructor(speechId, audioSource, playoutSource, synchronizer) {
55
56
  this.#speechId = speechId;
56
57
  this.#audioSource = audioSource;
57
58
  this.playoutSource = playoutSource;
59
+ this.synchronizer = synchronizer;
58
60
  }
59
61
  get speechId() {
60
62
  return this.#speechId;
@@ -95,11 +97,11 @@ class AgentPlayout extends import_node_events.default {
95
97
  set targetVolume(vol) {
96
98
  this.#targetVolume = vol;
97
99
  }
98
- play(speechId, playoutSource) {
100
+ play(speechId, playoutSource, synchronizer) {
99
101
  if (this.#closed) {
100
102
  throw new Error("source closed");
101
103
  }
102
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
104
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
103
105
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
104
106
  return handle;
105
107
  }
@@ -109,6 +111,7 @@ class AgentPlayout extends import_node_events.default {
109
111
  captureTask.cancel();
110
112
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
111
113
  if (handle.interrupted || captureTask.error) {
114
+ handle.synchronizer.close(true);
112
115
  this.#audioSource.clearQueue();
113
116
  }
114
117
  if (!firstFrame) {
@@ -139,12 +142,15 @@ class AgentPlayout extends import_node_events.default {
139
142
  if (firstFrame) {
140
143
  this.#logger.child({ speechId: handle.speechId }).debug("started playing the first time");
141
144
  this.emit(0 /* PLAYOUT_STARTED */);
145
+ handle.synchronizer.segmentPlayoutStarted();
142
146
  firstFrame = false;
143
147
  }
144
148
  handle.pushedDuration += frame.samplesPerChannel / frame.sampleRate * 1e3;
149
+ handle.synchronizer.pushAudio(frame);
145
150
  await this.#audioSource.captureFrame(frame);
146
151
  await this.#audioSource.waitForPlayout();
147
152
  }
153
+ handle.synchronizer.close(false);
148
154
  resolve2();
149
155
  });
150
156
  try {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAAyB;AACzB,iBAAoB;AACpB,mBAA6D;AAC7D,0BAAgC;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,oBAAO;AAAA,EACpB,UAAU,IAAI,oBAAO;AAAA,EAErB,YACE,UACA,aACA,eACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,mBAAAC,QAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,cAAU,gBAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,aAAa;AAE3E,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,gCAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,oCAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","EventEmitter","resolve","_","onCancel"]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n synchronizer: TextAudioSynchronizer;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n handle.synchronizer.close(true);\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n handle.synchronizer.segmentPlayoutStarted();\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n handle.synchronizer.pushAudio(frame);\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n handle.synchronizer.close(false);\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAAyB;AACzB,iBAAoB;AAEpB,mBAA6D;AAC7D,0BAAgC;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,oBAAO;AAAA,EACpB,UAAU,IAAI,oBAAO;AAAA,EAErB,YACE,UACA,aACA,eACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AACrB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,mBAAAC,QAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,cAAU,gBAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACA,cACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,eAAe,YAAY;AAEzF,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,iBAAO,aAAa,MAAM,IAAI;AAC9B,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,gCAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,oCAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,mBAAO,aAAa,sBAAsB;AAC1C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,iBAAO,aAAa,UAAU,KAAK;AACnC,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,eAAO,aAAa,MAAM,KAAK;AAC/B,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","EventEmitter","resolve","_","onCancel"]}
@@ -1,5 +1,6 @@
1
1
  import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
2
2
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
+ import type { TextAudioSynchronizer } from '../transcription.js';
3
4
  import { Future } from '../utils.js';
4
5
  import { SynthesisHandle } from './agent_output.js';
5
6
  export declare enum AgentPlayoutEvent {
@@ -14,10 +15,11 @@ export declare class PlayoutHandle {
14
15
  #private;
15
16
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
16
17
  totalPlayedTime?: number;
18
+ synchronizer: TextAudioSynchronizer;
17
19
  pushedDuration: number;
18
20
  intFut: Future;
19
21
  doneFut: Future;
20
- constructor(speechId: string, audioSource: AudioSource, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>);
22
+ constructor(speechId: string, audioSource: AudioSource, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer);
21
23
  get speechId(): string;
22
24
  get interrupted(): boolean;
23
25
  get timePlayed(): number;
@@ -31,7 +33,7 @@ export declare class AgentPlayout extends AgentPlayout_base {
31
33
  constructor(audioSource: AudioSource);
32
34
  get targetVolume(): number;
33
35
  set targetVolume(vol: number);
34
- play(speechId: string, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>): PlayoutHandle;
36
+ play(speechId: string, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer): PlayoutHandle;
35
37
  close(): Promise<void>;
36
38
  }
37
39
  export {};
@@ -1 +1 @@
1
- {"version":3,"file":"agent_playout.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_playout.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACjE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,oBAAY,iBAAiB;IAC3B,eAAe,IAAA;IACf,eAAe,IAAA;CAChB;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,MAAM,IAAI,CAAC;IAChD,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,KAAK,IAAI,CAAC;CACnE,CAAC;AAEF,qBAAa,aAAa;;IAGxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,CAAC;IACjF,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB,cAAc,SAAK;IACnB,MAAM,SAAgB;IACtB,OAAO,SAAgB;gBAGrB,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC;IAOlF,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,IAAI,IAAI,OAAO,CAElB;IAED,SAAS;IAST,IAAI,IAAI,MAAM;CAGf;2CAE4D,aAAa,qBAAqB,CAAC;AAAhG,qBAAa,YAAa,SAAQ,iBAA+D;;gBAOnF,WAAW,EAAE,WAAW;IAKpC,IAAI,YAAY,IAAI,MAAM,CAEzB;IAED,IAAI,YAAY,CAAC,GAAG,EAAE,MAAM,EAE3B;IAED,IAAI,CACF,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,GAC/E,aAAa;IA2FV,KAAK;CAIZ"}
1
+ {"version":3,"file":"agent_playout.d.ts","sourceRoot":"","sources":["../../src/pipeline/agent_playout.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACjE,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AACjE,OAAO,EAAsB,MAAM,EAAoB,MAAM,aAAa,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,oBAAY,iBAAiB;IAC3B,eAAe,IAAA;IACf,eAAe,IAAA;CAChB;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,MAAM,IAAI,CAAC;IAChD,CAAC,iBAAiB,CAAC,eAAe,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,KAAK,IAAI,CAAC;CACnE,CAAC;AAEF,qBAAa,aAAa;;IAGxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,CAAC;IACjF,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,qBAAqB,CAAC;IAEpC,cAAc,SAAK;IACnB,MAAM,SAAgB;IACtB,OAAO,SAAgB;gBAGrB,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,WAAW,EACxB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,EAChF,YAAY,EAAE,qBAAqB;IAQrC,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,IAAI,IAAI,OAAO,CAElB;IAED,SAAS;IAST,IAAI,IAAI,MAAM;CAGf;2CAE4D,aAAa,qBAAqB,CAAC;AAAhG,qBAAa,YAAa,SAAQ,iBAA+D;;gBAOnF,WAAW,EAAE,WAAW;IAKpC,IAAI,YAAY,IAAI,MAAM,CAEzB;IAED,IAAI,YAAY,CAAC,GAAG,EAAE,MAAM,EAE3B;IAED,IAAI,CACF,QAAQ,EAAE,MAAM,EAChB,aAAa,EAAE,aAAa,CAAC,UAAU,GAAG,OAAO,eAAe,CAAC,cAAc,CAAC,EAChF,YAAY,EAAE,qBAAqB,GAClC,aAAa;IA+FV,KAAK;CAIZ"}
@@ -12,14 +12,16 @@ class PlayoutHandle {
12
12
  #audioSource;
13
13
  playoutSource;
14
14
  totalPlayedTime;
15
+ synchronizer;
15
16
  #interrupted = false;
16
17
  pushedDuration = 0;
17
18
  intFut = new Future();
18
19
  doneFut = new Future();
19
- constructor(speechId, audioSource, playoutSource) {
20
+ constructor(speechId, audioSource, playoutSource, synchronizer) {
20
21
  this.#speechId = speechId;
21
22
  this.#audioSource = audioSource;
22
23
  this.playoutSource = playoutSource;
24
+ this.synchronizer = synchronizer;
23
25
  }
24
26
  get speechId() {
25
27
  return this.#speechId;
@@ -60,11 +62,11 @@ class AgentPlayout extends EventEmitter {
60
62
  set targetVolume(vol) {
61
63
  this.#targetVolume = vol;
62
64
  }
63
- play(speechId, playoutSource) {
65
+ play(speechId, playoutSource, synchronizer) {
64
66
  if (this.#closed) {
65
67
  throw new Error("source closed");
66
68
  }
67
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
69
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
68
70
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
69
71
  return handle;
70
72
  }
@@ -74,6 +76,7 @@ class AgentPlayout extends EventEmitter {
74
76
  captureTask.cancel();
75
77
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
76
78
  if (handle.interrupted || captureTask.error) {
79
+ handle.synchronizer.close(true);
77
80
  this.#audioSource.clearQueue();
78
81
  }
79
82
  if (!firstFrame) {
@@ -104,12 +107,15 @@ class AgentPlayout extends EventEmitter {
104
107
  if (firstFrame) {
105
108
  this.#logger.child({ speechId: handle.speechId }).debug("started playing the first time");
106
109
  this.emit(0 /* PLAYOUT_STARTED */);
110
+ handle.synchronizer.segmentPlayoutStarted();
107
111
  firstFrame = false;
108
112
  }
109
113
  handle.pushedDuration += frame.samplesPerChannel / frame.sampleRate * 1e3;
114
+ handle.synchronizer.pushAudio(frame);
110
115
  await this.#audioSource.captureFrame(frame);
111
116
  await this.#audioSource.waitForPlayout();
112
117
  }
118
+ handle.synchronizer.close(false);
113
119
  resolve2();
114
120
  });
115
121
  try {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":"AAKA,OAAO,kBAAkB;AACzB,SAAS,WAAW;AACpB,SAAS,oBAAoB,QAAQ,wBAAwB;AAC7D,SAAS,uBAAuB;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI,OAAO;AAAA,EAErB,YACE,UACA,aACA,eACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,aAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,UAAU,IAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,aAAa;AAE3E,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,cAAM,iBAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,mBAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,gBAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","resolve","_","onCancel"]}
1
+ {"version":3,"sources":["../../src/pipeline/agent_playout.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame, AudioSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport { log } from '../log.js';\nimport type { TextAudioSynchronizer } from '../transcription.js';\nimport { CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { SynthesisHandle } from './agent_output.js';\n\nexport enum AgentPlayoutEvent {\n PLAYOUT_STARTED,\n PLAYOUT_STOPPED,\n}\n\nexport type AgentPlayoutCallbacks = {\n [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;\n [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;\n};\n\nexport class PlayoutHandle {\n #speechId: string;\n #audioSource: AudioSource;\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;\n totalPlayedTime?: number;\n synchronizer: TextAudioSynchronizer;\n #interrupted = false;\n pushedDuration = 0;\n intFut = new Future();\n doneFut = new Future();\n\n constructor(\n speechId: string,\n audioSource: AudioSource,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ) {\n this.#speechId = speechId;\n this.#audioSource = audioSource;\n this.playoutSource = playoutSource;\n this.synchronizer = synchronizer;\n }\n\n get speechId(): string {\n return this.#speechId;\n }\n\n get interrupted(): boolean {\n return this.#interrupted;\n }\n\n get timePlayed(): number {\n return this.totalPlayedTime || this.pushedDuration - this.#audioSource.queuedDuration;\n }\n\n get done(): boolean {\n return this.doneFut.done || this.#interrupted;\n }\n\n interrupt() {\n if (this.done) {\n return;\n }\n\n this.intFut.resolve();\n this.#interrupted = true;\n }\n\n join(): Future {\n return this.doneFut;\n }\n}\n\nexport class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {\n #closed = false;\n #audioSource: AudioSource;\n #targetVolume = 1;\n #playoutTask?: CancellablePromise<void>;\n #logger = log();\n\n constructor(audioSource: AudioSource) {\n super();\n this.#audioSource = audioSource;\n }\n\n get targetVolume(): number {\n return this.#targetVolume;\n }\n\n set targetVolume(vol: number) {\n this.#targetVolume = vol;\n }\n\n play(\n speechId: string,\n playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,\n synchronizer: TextAudioSynchronizer,\n ): PlayoutHandle {\n if (this.#closed) {\n throw new Error('source closed');\n }\n\n const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);\n\n this.#playoutTask = this.#playout(handle, this.#playoutTask);\n return handle;\n }\n\n #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n const cancel = () => {\n captureTask.cancel();\n handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;\n\n if (handle.interrupted || captureTask.error) {\n handle.synchronizer.close(true);\n this.#audioSource.clearQueue(); // make sure to remove any queued frames\n }\n\n if (!firstFrame) {\n this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);\n }\n\n handle.doneFut.resolve();\n\n this.#logger\n .child({ speechId: handle.speechId, interrupted: handle.interrupted })\n .debug('playout finished');\n };\n\n onCancel(() => {\n cancel();\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n if (this.#audioSource.queuedDuration > 0) {\n // this should not happen, but log it just in case\n this.#logger\n .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })\n .warn('new playout while the source is still playing');\n }\n\n let firstFrame = true;\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n const captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n for await (const frame of handle.playoutSource) {\n if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {\n break;\n }\n if (firstFrame) {\n this.#logger\n .child({ speechId: handle.speechId })\n .debug('started playing the first time');\n this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);\n handle.synchronizer.segmentPlayoutStarted();\n firstFrame = false;\n }\n handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;\n handle.synchronizer.pushAudio(frame);\n await this.#audioSource.captureFrame(frame);\n await this.#audioSource.waitForPlayout();\n }\n\n // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,\n // but for some reason too many TTS frames can gunk up the buffer and lead to\n // FFI errors. this works 🤷‍♀️\n // if (this.#audioSource.queuedDuration > 0) {\n // await this.#audioSource.waitForPlayout();\n // }\n\n handle.synchronizer.close(false);\n resolve();\n });\n\n try {\n await Promise.any([captureTask, handle.intFut.await]);\n } finally {\n cancel();\n resolve();\n }\n });\n }\n\n async close() {\n this.#closed = true;\n await this.#playoutTask;\n }\n}\n"],"mappings":"AAKA,OAAO,kBAAkB;AACzB,SAAS,WAAW;AAEpB,SAAS,oBAAoB,QAAQ,wBAAwB;AAC7D,SAAS,uBAAuB;AAEzB,IAAK,oBAAL,kBAAKA,uBAAL;AACL,EAAAA,sCAAA;AACA,EAAAA,sCAAA;AAFU,SAAAA;AAAA,GAAA;AAUL,MAAM,cAAc;AAAA,EACzB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,eAAe;AAAA,EACf,iBAAiB;AAAA,EACjB,SAAS,IAAI,OAAO;AAAA,EACpB,UAAU,IAAI,OAAO;AAAA,EAErB,YACE,UACA,aACA,eACA,cACA;AACA,SAAK,YAAY;AACjB,SAAK,eAAe;AACpB,SAAK,gBAAgB;AACrB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,WAAmB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK,mBAAmB,KAAK,iBAAiB,KAAK,aAAa;AAAA,EACzE;AAAA,EAEA,IAAI,OAAgB;AAClB,WAAO,KAAK,QAAQ,QAAQ,KAAK;AAAA,EACnC;AAAA,EAEA,YAAY;AACV,QAAI,KAAK,MAAM;AACb;AAAA,IACF;AAEA,SAAK,OAAO,QAAQ;AACpB,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,OAAe;AACb,WAAO,KAAK;AAAA,EACd;AACF;AAEO,MAAM,qBAAsB,aAA+D;AAAA,EAChG,UAAU;AAAA,EACV;AAAA,EACA,gBAAgB;AAAA,EAChB;AAAA,EACA,UAAU,IAAI;AAAA,EAEd,YAAY,aAA0B;AACpC,UAAM;AACN,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,IAAI,eAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,aAAa,KAAa;AAC5B,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,KACE,UACA,eACA,cACe;AACf,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,eAAe;AAAA,IACjC;AAEA,UAAM,SAAS,IAAI,cAAc,UAAU,KAAK,cAAc,eAAe,YAAY;AAEzF,SAAK,eAAe,KAAK,SAAS,QAAQ,KAAK,YAAY;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,SAAS,QAAuB,SAA8D;AAC5F,WAAO,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,YAAM,SAAS,MAAM;AACnB,oBAAY,OAAO;AACnB,eAAO,kBAAkB,OAAO,iBAAiB,KAAK,aAAa;AAEnE,YAAI,OAAO,eAAe,YAAY,OAAO;AAC3C,iBAAO,aAAa,MAAM,IAAI;AAC9B,eAAK,aAAa,WAAW;AAAA,QAC/B;AAEA,YAAI,CAAC,YAAY;AACf,eAAK,KAAK,yBAAmC,OAAO,WAAW;AAAA,QACjE;AAEA,eAAO,QAAQ,QAAQ;AAEvB,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,aAAa,OAAO,YAAY,CAAC,EACpE,MAAM,kBAAkB;AAAA,MAC7B;AAEA,eAAS,MAAM;AACb,eAAO;AAAA,MACT,CAAC;AAED,UAAI,SAAS;AACX,cAAM,iBAAiB,OAAO;AAAA,MAChC;AAEA,UAAI,KAAK,aAAa,iBAAiB,GAAG;AAExC,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,UAAU,gBAAgB,KAAK,aAAa,eAAe,CAAC,EACrF,KAAK,+CAA+C;AAAA,MACzD;AAEA,UAAI,aAAa;AAGjB,YAAM,cAAc,IAAI,mBAAyB,OAAOC,UAASC,IAAGC,cAAa;AAC/E,YAAI,YAAY;AAChB,QAAAA,UAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,yBAAiB,SAAS,OAAO,eAAe;AAC9C,cAAI,aAAa,UAAU,gBAAgB,gBAAgB;AACzD;AAAA,UACF;AACA,cAAI,YAAY;AACd,iBAAK,QACF,MAAM,EAAE,UAAU,OAAO,SAAS,CAAC,EACnC,MAAM,gCAAgC;AACzC,iBAAK,KAAK,uBAAiC;AAC3C,mBAAO,aAAa,sBAAsB;AAC1C,yBAAa;AAAA,UACf;AACA,iBAAO,kBAAmB,MAAM,oBAAoB,MAAM,aAAc;AACxE,iBAAO,aAAa,UAAU,KAAK;AACnC,gBAAM,KAAK,aAAa,aAAa,KAAK;AAC1C,gBAAM,KAAK,aAAa,eAAe;AAAA,QACzC;AASA,eAAO,aAAa,MAAM,KAAK;AAC/B,QAAAF,SAAQ;AAAA,MACV,CAAC;AAED,UAAI;AACF,cAAM,QAAQ,IAAI,CAAC,aAAa,OAAO,OAAO,KAAK,CAAC;AAAA,MACtD,UAAE;AACA,eAAO;AACP,gBAAQ;AAAA,MACV;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,QAAQ;AACZ,SAAK,UAAU;AACf,UAAM,KAAK;AAAA,EACb;AACF;","names":["AgentPlayoutEvent","resolve","_","onCancel"]}
@@ -57,6 +57,12 @@ class HumanInput extends import_node_events.EventEmitter {
57
57
  this.#room.on(import_rtc_node.RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));
58
58
  this.#subscribeToMicrophone();
59
59
  }
60
+ get participant() {
61
+ return this.#participant;
62
+ }
63
+ get subscribedTrack() {
64
+ return this.#subscribedTrack;
65
+ }
60
66
  #subscribeToMicrophone() {
61
67
  if (!this.#participant) {
62
68
  this.#logger.error("Participant is not set");
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/human_input.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport { AudioStream, RoomEvent, TrackSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport { log } from '../log.js';\nimport type { STT, SpeechEvent } from '../stt/stt.js';\nimport { SpeechEventType } from '../stt/stt.js';\nimport { CancellablePromise, gracefullyCancel } from '../utils.js';\nimport type { VAD, VADEvent } from '../vad.js';\nimport { VADEventType } from '../vad.js';\n\nexport enum HumanInputEvent {\n START_OF_SPEECH,\n VAD_INFERENCE_DONE,\n END_OF_SPEECH,\n FINAL_TRANSCRIPT,\n INTERIM_TRANSCRIPT,\n}\n\nexport type HumanInputCallbacks = {\n [HumanInputEvent.START_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.VAD_INFERENCE_DONE]: (event: VADEvent) => void;\n [HumanInputEvent.END_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.FINAL_TRANSCRIPT]: (event: SpeechEvent) => void;\n [HumanInputEvent.INTERIM_TRANSCRIPT]: (event: SpeechEvent) => void;\n};\n\nexport class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInputCallbacks>) {\n #closed = false;\n #room: Room;\n #vad: VAD;\n #stt: STT;\n #participant: RemoteParticipant;\n #subscribedTrack?: RemoteAudioTrack;\n #recognizeTask?: CancellablePromise<void>;\n #speaking = false;\n #speechProbability = 0;\n #logger = log();\n\n constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant) {\n super();\n this.#room = room;\n this.#vad = vad;\n this.#stt = stt;\n this.#participant = participant;\n\n this.#room.on(RoomEvent.TrackPublished, this.#subscribeToMicrophone.bind(this));\n this.#room.on(RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));\n this.#subscribeToMicrophone();\n }\n\n #subscribeToMicrophone(): void {\n if (!this.#participant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.#participant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n\n const track = microphonePublication.track;\n if (track && track !== this.#subscribedTrack) {\n this.#subscribedTrack = track;\n if (this.#recognizeTask) {\n this.#recognizeTask.cancel();\n }\n\n const audioStream = new AudioStream(track, 16000);\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#recognizeTask = new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const sttStream = this.#stt.stream();\n const vadStream = this.#vad.stream();\n\n const audioStreamCo = async () => {\n for await (const ev of audioStream) {\n if (cancelled) return;\n sttStream.pushFrame(ev);\n vadStream.pushFrame(ev);\n }\n };\n\n const vadStreamCo = async () => {\n for await (const ev of vadStream) {\n if (cancelled) return;\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.#speaking = true;\n this.emit(HumanInputEvent.START_OF_SPEECH, ev);\n break;\n case VADEventType.INFERENCE_DONE:\n this.#speechProbability = ev.probability;\n this.emit(HumanInputEvent.VAD_INFERENCE_DONE, ev);\n break;\n case VADEventType.END_OF_SPEECH:\n this.#speaking = false;\n this.emit(HumanInputEvent.END_OF_SPEECH, ev);\n break;\n }\n }\n };\n\n const sttStreamCo = async () => {\n for await (const ev of sttStream) {\n if (cancelled) return;\n if (ev.type === SpeechEventType.FINAL_TRANSCRIPT) {\n this.emit(HumanInputEvent.FINAL_TRANSCRIPT, ev);\n } else if (ev.type == SpeechEventType.INTERIM_TRANSCRIPT) {\n this.emit(HumanInputEvent.INTERIM_TRANSCRIPT, ev);\n }\n }\n };\n\n await Promise.all([audioStreamCo(), vadStreamCo(), sttStreamCo()]);\n sttStream.close();\n vadStream.close();\n resolve();\n });\n }\n }\n\n get speaking(): boolean {\n return this.#speaking;\n }\n\n get speakingProbability(): number {\n return this.#speechProbability;\n }\n\n async close() {\n if (this.#closed) {\n throw new Error('HumanInput already closed');\n }\n this.#closed = true;\n this.#room.removeAllListeners();\n this.#speaking = false;\n if (this.#recognizeTask) {\n await gracefullyCancel(this.#recognizeTask);\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AASA,sBAAoD;AAEpD,yBAA6B;AAC7B,iBAAoB;AAEpB,iBAAgC;AAChC,mBAAqD;AAErD,iBAA6B;AAEtB,IAAK,kBAAL,kBAAKA,qBAAL;AACL,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AALU,SAAAA;AAAA,GAAA;AAgBL,MAAM,mBAAoB,gCAA6D;AAAA,EAC5F,UAAU;AAAA,EACV;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA,EACZ,qBAAqB;AAAA,EACrB,cAAU,gBAAI;AAAA,EAEd,YAAY,MAAY,KAAU,KAAU,aAAgC;AAC1E,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,eAAe;AAEpB,SAAK,MAAM,GAAG,0BAAU,gBAAgB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC9E,SAAK,MAAM,GAAG,0BAAU,iBAAiB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC/E,SAAK,uBAAuB;AAAA,EAC9B;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,aAAa,kBAAkB,OAAO,GAAG;AACtE,UAAI,YAAY,WAAW,4BAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAEA,UAAM,QAAQ,sBAAsB;AACpC,QAAI,SAAS,UAAU,KAAK,kBAAkB;AAC5C,WAAK,mBAAmB;AACxB,UAAI,KAAK,gBAAgB;AACvB,aAAK,eAAe,OAAO;AAAA,MAC7B;AAEA,YAAM,cAAc,IAAI,4BAAY,OAAO,IAAK;AAGhD,WAAK,iBAAiB,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC3E,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,cAAM,YAAY,KAAK,KAAK,OAAO;AACnC,cAAM,YAAY,KAAK,KAAK,OAAO;AAEnC,cAAM,gBAAgB,YAAY;AAChC,2BAAiB,MAAM,aAAa;AAClC,gBAAI,UAAW;AACf,sBAAU,UAAU,EAAE;AACtB,sBAAU,UAAU,EAAE;AAAA,UACxB;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,oBAAQ,GAAG,MAAM;AAAA,cACf,KAAK,wBAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,yBAAiC,EAAE;AAC7C;AAAA,cACF,KAAK,wBAAa;AAChB,qBAAK,qBAAqB,GAAG;AAC7B,qBAAK,KAAK,4BAAoC,EAAE;AAChD;AAAA,cACF,KAAK,wBAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,uBAA+B,EAAE;AAC3C;AAAA,YACJ;AAAA,UACF;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,gBAAI,GAAG,SAAS,2BAAgB,kBAAkB;AAChD,mBAAK,KAAK,0BAAkC,EAAE;AAAA,YAChD,WAAW,GAAG,QAAQ,2BAAgB,oBAAoB;AACxD,mBAAK,KAAK,4BAAoC,EAAE;AAAA,YAClD;AAAA,UACF;AAAA,QACF;AAEA,cAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,YAAY,GAAG,YAAY,CAAC,CAAC;AACjE,kBAAU,MAAM;AAChB,kBAAU,MAAM;AAChB,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAAA,EACF;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAA8B;AAChC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AACA,SAAK,UAAU;AACf,SAAK,MAAM,mBAAmB;AAC9B,SAAK,YAAY;AACjB,QAAI,KAAK,gBAAgB;AACvB,gBAAM,+BAAiB,KAAK,cAAc;AAAA,IAC5C;AAAA,EACF;AACF;","names":["HumanInputEvent"]}
1
+ {"version":3,"sources":["../../src/pipeline/human_input.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport { AudioStream, RoomEvent, TrackSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport { log } from '../log.js';\nimport type { STT, SpeechEvent } from '../stt/stt.js';\nimport { SpeechEventType } from '../stt/stt.js';\nimport { CancellablePromise, gracefullyCancel } from '../utils.js';\nimport type { VAD, VADEvent } from '../vad.js';\nimport { VADEventType } from '../vad.js';\n\nexport enum HumanInputEvent {\n START_OF_SPEECH,\n VAD_INFERENCE_DONE,\n END_OF_SPEECH,\n FINAL_TRANSCRIPT,\n INTERIM_TRANSCRIPT,\n}\n\nexport type HumanInputCallbacks = {\n [HumanInputEvent.START_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.VAD_INFERENCE_DONE]: (event: VADEvent) => void;\n [HumanInputEvent.END_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.FINAL_TRANSCRIPT]: (event: SpeechEvent) => void;\n [HumanInputEvent.INTERIM_TRANSCRIPT]: (event: SpeechEvent) => void;\n};\n\nexport class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInputCallbacks>) {\n #closed = false;\n #room: Room;\n #vad: VAD;\n #stt: STT;\n #participant: RemoteParticipant;\n #subscribedTrack?: RemoteAudioTrack;\n #recognizeTask?: CancellablePromise<void>;\n #speaking = false;\n #speechProbability = 0;\n #logger = log();\n\n constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant) {\n super();\n this.#room = room;\n this.#vad = vad;\n this.#stt = stt;\n this.#participant = participant;\n\n this.#room.on(RoomEvent.TrackPublished, this.#subscribeToMicrophone.bind(this));\n this.#room.on(RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));\n this.#subscribeToMicrophone();\n }\n\n get participant(): RemoteParticipant {\n return this.#participant;\n }\n\n get subscribedTrack(): RemoteAudioTrack | undefined {\n return this.#subscribedTrack;\n }\n\n #subscribeToMicrophone(): void {\n if (!this.#participant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.#participant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n\n const track = microphonePublication.track;\n if (track && track !== this.#subscribedTrack) {\n this.#subscribedTrack = track;\n if (this.#recognizeTask) {\n this.#recognizeTask.cancel();\n }\n\n const audioStream = new AudioStream(track, 16000);\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#recognizeTask = new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const sttStream = this.#stt.stream();\n const vadStream = this.#vad.stream();\n\n const audioStreamCo = async () => {\n for await (const ev of audioStream) {\n if (cancelled) return;\n sttStream.pushFrame(ev);\n vadStream.pushFrame(ev);\n }\n };\n\n const vadStreamCo = async () => {\n for await (const ev of vadStream) {\n if (cancelled) return;\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.#speaking = true;\n this.emit(HumanInputEvent.START_OF_SPEECH, ev);\n break;\n case VADEventType.INFERENCE_DONE:\n this.#speechProbability = ev.probability;\n this.emit(HumanInputEvent.VAD_INFERENCE_DONE, ev);\n break;\n case VADEventType.END_OF_SPEECH:\n this.#speaking = false;\n this.emit(HumanInputEvent.END_OF_SPEECH, ev);\n break;\n }\n }\n };\n\n const sttStreamCo = async () => {\n for await (const ev of sttStream) {\n if (cancelled) return;\n if (ev.type === SpeechEventType.FINAL_TRANSCRIPT) {\n this.emit(HumanInputEvent.FINAL_TRANSCRIPT, ev);\n } else if (ev.type == SpeechEventType.INTERIM_TRANSCRIPT) {\n this.emit(HumanInputEvent.INTERIM_TRANSCRIPT, ev);\n }\n }\n };\n\n await Promise.all([audioStreamCo(), vadStreamCo(), sttStreamCo()]);\n sttStream.close();\n vadStream.close();\n resolve();\n });\n }\n }\n\n get speaking(): boolean {\n return this.#speaking;\n }\n\n get speakingProbability(): number {\n return this.#speechProbability;\n }\n\n async close() {\n if (this.#closed) {\n throw new Error('HumanInput already closed');\n }\n this.#closed = true;\n this.#room.removeAllListeners();\n this.#speaking = false;\n if (this.#recognizeTask) {\n await gracefullyCancel(this.#recognizeTask);\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AASA,sBAAoD;AAEpD,yBAA6B;AAC7B,iBAAoB;AAEpB,iBAAgC;AAChC,mBAAqD;AAErD,iBAA6B;AAEtB,IAAK,kBAAL,kBAAKA,qBAAL;AACL,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AALU,SAAAA;AAAA,GAAA;AAgBL,MAAM,mBAAoB,gCAA6D;AAAA,EAC5F,UAAU;AAAA,EACV;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA,EACZ,qBAAqB;AAAA,EACrB,cAAU,gBAAI;AAAA,EAEd,YAAY,MAAY,KAAU,KAAU,aAAgC;AAC1E,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,eAAe;AAEpB,SAAK,MAAM,GAAG,0BAAU,gBAAgB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC9E,SAAK,MAAM,GAAG,0BAAU,iBAAiB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC/E,SAAK,uBAAuB;AAAA,EAC9B;AAAA,EAEA,IAAI,cAAiC;AACnC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,kBAAgD;AAClD,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,aAAa,kBAAkB,OAAO,GAAG;AACtE,UAAI,YAAY,WAAW,4BAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAEA,UAAM,QAAQ,sBAAsB;AACpC,QAAI,SAAS,UAAU,KAAK,kBAAkB;AAC5C,WAAK,mBAAmB;AACxB,UAAI,KAAK,gBAAgB;AACvB,aAAK,eAAe,OAAO;AAAA,MAC7B;AAEA,YAAM,cAAc,IAAI,4BAAY,OAAO,IAAK;AAGhD,WAAK,iBAAiB,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC3E,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,cAAM,YAAY,KAAK,KAAK,OAAO;AACnC,cAAM,YAAY,KAAK,KAAK,OAAO;AAEnC,cAAM,gBAAgB,YAAY;AAChC,2BAAiB,MAAM,aAAa;AAClC,gBAAI,UAAW;AACf,sBAAU,UAAU,EAAE;AACtB,sBAAU,UAAU,EAAE;AAAA,UACxB;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,oBAAQ,GAAG,MAAM;AAAA,cACf,KAAK,wBAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,yBAAiC,EAAE;AAC7C;AAAA,cACF,KAAK,wBAAa;AAChB,qBAAK,qBAAqB,GAAG;AAC7B,qBAAK,KAAK,4BAAoC,EAAE;AAChD;AAAA,cACF,KAAK,wBAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,uBAA+B,EAAE;AAC3C;AAAA,YACJ;AAAA,UACF;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,gBAAI,GAAG,SAAS,2BAAgB,kBAAkB;AAChD,mBAAK,KAAK,0BAAkC,EAAE;AAAA,YAChD,WAAW,GAAG,QAAQ,2BAAgB,oBAAoB;AACxD,mBAAK,KAAK,4BAAoC,EAAE;AAAA,YAClD;AAAA,UACF;AAAA,QACF;AAEA,cAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,YAAY,GAAG,YAAY,CAAC,CAAC;AACjE,kBAAU,MAAM;AAChB,kBAAU,MAAM;AAChB,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAAA,EACF;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAA8B;AAChC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AACA,SAAK,UAAU;AACf,SAAK,MAAM,mBAAmB;AAC9B,SAAK,YAAY;AACjB,QAAI,KAAK,gBAAgB;AACvB,gBAAM,+BAAiB,KAAK,cAAc;AAAA,IAC5C;AAAA,EACF;AACF;","names":["HumanInputEvent"]}
@@ -1,4 +1,4 @@
1
- import type { RemoteParticipant, Room } from '@livekit/rtc-node';
1
+ import type { RemoteAudioTrack, RemoteParticipant, Room } from '@livekit/rtc-node';
2
2
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
3
  import type { STT, SpeechEvent } from '../stt/stt.js';
4
4
  import type { VAD, VADEvent } from '../vad.js';
@@ -20,6 +20,8 @@ declare const HumanInput_base: new () => TypedEmitter<HumanInputCallbacks>;
20
20
  export declare class HumanInput extends HumanInput_base {
21
21
  #private;
22
22
  constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant);
23
+ get participant(): RemoteParticipant;
24
+ get subscribedTrack(): RemoteAudioTrack | undefined;
23
25
  get speaking(): boolean;
24
26
  get speakingProbability(): number;
25
27
  close(): Promise<void>;
@@ -1 +1 @@
1
- {"version":3,"file":"human_input.d.ts","sourceRoot":"","sources":["../../src/pipeline/human_input.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAEV,iBAAiB,EAEjB,IAAI,EACL,MAAM,mBAAmB,CAAC;AAE3B,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,KAAK,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAGtD,OAAO,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAG/C,oBAAY,eAAe;IACzB,eAAe,IAAA;IACf,kBAAkB,IAAA;IAClB,aAAa,IAAA;IACb,gBAAgB,IAAA;IAChB,kBAAkB,IAAA;CACnB;AAED,MAAM,MAAM,mBAAmB,GAAG;IAChC,CAAC,eAAe,CAAC,eAAe,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC7D,CAAC,eAAe,CAAC,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAChE,CAAC,eAAe,CAAC,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC3D,CAAC,eAAe,CAAC,gBAAgB,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,IAAI,CAAC;IACjE,CAAC,eAAe,CAAC,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,IAAI,CAAC;CACpE,CAAC;yCAEyD,aAAa,mBAAmB,CAAC;AAA5F,qBAAa,UAAW,SAAQ,eAA6D;;gBAY/E,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,WAAW,EAAE,iBAAiB;IAmG1E,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAEK,KAAK;CAWZ"}
1
+ {"version":3,"file":"human_input.d.ts","sourceRoot":"","sources":["../../src/pipeline/human_input.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EAEjB,IAAI,EACL,MAAM,mBAAmB,CAAC;AAE3B,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGhF,OAAO,KAAK,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAGtD,OAAO,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAG/C,oBAAY,eAAe;IACzB,eAAe,IAAA;IACf,kBAAkB,IAAA;IAClB,aAAa,IAAA;IACb,gBAAgB,IAAA;IAChB,kBAAkB,IAAA;CACnB;AAED,MAAM,MAAM,mBAAmB,GAAG;IAChC,CAAC,eAAe,CAAC,eAAe,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC7D,CAAC,eAAe,CAAC,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAChE,CAAC,eAAe,CAAC,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,QAAQ,KAAK,IAAI,CAAC;IAC3D,CAAC,eAAe,CAAC,gBAAgB,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,IAAI,CAAC;IACjE,CAAC,eAAe,CAAC,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,IAAI,CAAC;CACpE,CAAC;yCAEyD,aAAa,mBAAmB,CAAC;AAA5F,qBAAa,UAAW,SAAQ,eAA6D;;gBAY/E,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,WAAW,EAAE,iBAAiB;IAY1E,IAAI,WAAW,IAAI,iBAAiB,CAEnC;IAED,IAAI,eAAe,IAAI,gBAAgB,GAAG,SAAS,CAElD;IAyFD,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAEK,KAAK;CAWZ"}
@@ -33,6 +33,12 @@ class HumanInput extends EventEmitter {
33
33
  this.#room.on(RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));
34
34
  this.#subscribeToMicrophone();
35
35
  }
36
+ get participant() {
37
+ return this.#participant;
38
+ }
39
+ get subscribedTrack() {
40
+ return this.#subscribedTrack;
41
+ }
36
42
  #subscribeToMicrophone() {
37
43
  if (!this.#participant) {
38
44
  this.#logger.error("Participant is not set");
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipeline/human_input.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport { AudioStream, RoomEvent, TrackSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport { log } from '../log.js';\nimport type { STT, SpeechEvent } from '../stt/stt.js';\nimport { SpeechEventType } from '../stt/stt.js';\nimport { CancellablePromise, gracefullyCancel } from '../utils.js';\nimport type { VAD, VADEvent } from '../vad.js';\nimport { VADEventType } from '../vad.js';\n\nexport enum HumanInputEvent {\n START_OF_SPEECH,\n VAD_INFERENCE_DONE,\n END_OF_SPEECH,\n FINAL_TRANSCRIPT,\n INTERIM_TRANSCRIPT,\n}\n\nexport type HumanInputCallbacks = {\n [HumanInputEvent.START_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.VAD_INFERENCE_DONE]: (event: VADEvent) => void;\n [HumanInputEvent.END_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.FINAL_TRANSCRIPT]: (event: SpeechEvent) => void;\n [HumanInputEvent.INTERIM_TRANSCRIPT]: (event: SpeechEvent) => void;\n};\n\nexport class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInputCallbacks>) {\n #closed = false;\n #room: Room;\n #vad: VAD;\n #stt: STT;\n #participant: RemoteParticipant;\n #subscribedTrack?: RemoteAudioTrack;\n #recognizeTask?: CancellablePromise<void>;\n #speaking = false;\n #speechProbability = 0;\n #logger = log();\n\n constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant) {\n super();\n this.#room = room;\n this.#vad = vad;\n this.#stt = stt;\n this.#participant = participant;\n\n this.#room.on(RoomEvent.TrackPublished, this.#subscribeToMicrophone.bind(this));\n this.#room.on(RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));\n this.#subscribeToMicrophone();\n }\n\n #subscribeToMicrophone(): void {\n if (!this.#participant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.#participant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n\n const track = microphonePublication.track;\n if (track && track !== this.#subscribedTrack) {\n this.#subscribedTrack = track;\n if (this.#recognizeTask) {\n this.#recognizeTask.cancel();\n }\n\n const audioStream = new AudioStream(track, 16000);\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#recognizeTask = new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const sttStream = this.#stt.stream();\n const vadStream = this.#vad.stream();\n\n const audioStreamCo = async () => {\n for await (const ev of audioStream) {\n if (cancelled) return;\n sttStream.pushFrame(ev);\n vadStream.pushFrame(ev);\n }\n };\n\n const vadStreamCo = async () => {\n for await (const ev of vadStream) {\n if (cancelled) return;\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.#speaking = true;\n this.emit(HumanInputEvent.START_OF_SPEECH, ev);\n break;\n case VADEventType.INFERENCE_DONE:\n this.#speechProbability = ev.probability;\n this.emit(HumanInputEvent.VAD_INFERENCE_DONE, ev);\n break;\n case VADEventType.END_OF_SPEECH:\n this.#speaking = false;\n this.emit(HumanInputEvent.END_OF_SPEECH, ev);\n break;\n }\n }\n };\n\n const sttStreamCo = async () => {\n for await (const ev of sttStream) {\n if (cancelled) return;\n if (ev.type === SpeechEventType.FINAL_TRANSCRIPT) {\n this.emit(HumanInputEvent.FINAL_TRANSCRIPT, ev);\n } else if (ev.type == SpeechEventType.INTERIM_TRANSCRIPT) {\n this.emit(HumanInputEvent.INTERIM_TRANSCRIPT, ev);\n }\n }\n };\n\n await Promise.all([audioStreamCo(), vadStreamCo(), sttStreamCo()]);\n sttStream.close();\n vadStream.close();\n resolve();\n });\n }\n }\n\n get speaking(): boolean {\n return this.#speaking;\n }\n\n get speakingProbability(): number {\n return this.#speechProbability;\n }\n\n async close() {\n if (this.#closed) {\n throw new Error('HumanInput already closed');\n }\n this.#closed = true;\n this.#room.removeAllListeners();\n this.#speaking = false;\n if (this.#recognizeTask) {\n await gracefullyCancel(this.#recognizeTask);\n }\n }\n}\n"],"mappings":"AASA,SAAS,aAAa,WAAW,mBAAmB;AAEpD,SAAS,oBAAoB;AAC7B,SAAS,WAAW;AAEpB,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,wBAAwB;AAErD,SAAS,oBAAoB;AAEtB,IAAK,kBAAL,kBAAKA,qBAAL;AACL,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AALU,SAAAA;AAAA,GAAA;AAgBL,MAAM,mBAAoB,aAA6D;AAAA,EAC5F,UAAU;AAAA,EACV;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA,EACZ,qBAAqB;AAAA,EACrB,UAAU,IAAI;AAAA,EAEd,YAAY,MAAY,KAAU,KAAU,aAAgC;AAC1E,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,eAAe;AAEpB,SAAK,MAAM,GAAG,UAAU,gBAAgB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC9E,SAAK,MAAM,GAAG,UAAU,iBAAiB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC/E,SAAK,uBAAuB;AAAA,EAC9B;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,aAAa,kBAAkB,OAAO,GAAG;AACtE,UAAI,YAAY,WAAW,YAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAEA,UAAM,QAAQ,sBAAsB;AACpC,QAAI,SAAS,UAAU,KAAK,kBAAkB;AAC5C,WAAK,mBAAmB;AACxB,UAAI,KAAK,gBAAgB;AACvB,aAAK,eAAe,OAAO;AAAA,MAC7B;AAEA,YAAM,cAAc,IAAI,YAAY,OAAO,IAAK;AAGhD,WAAK,iBAAiB,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC3E,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,cAAM,YAAY,KAAK,KAAK,OAAO;AACnC,cAAM,YAAY,KAAK,KAAK,OAAO;AAEnC,cAAM,gBAAgB,YAAY;AAChC,2BAAiB,MAAM,aAAa;AAClC,gBAAI,UAAW;AACf,sBAAU,UAAU,EAAE;AACtB,sBAAU,UAAU,EAAE;AAAA,UACxB;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,oBAAQ,GAAG,MAAM;AAAA,cACf,KAAK,aAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,yBAAiC,EAAE;AAC7C;AAAA,cACF,KAAK,aAAa;AAChB,qBAAK,qBAAqB,GAAG;AAC7B,qBAAK,KAAK,4BAAoC,EAAE;AAChD;AAAA,cACF,KAAK,aAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,uBAA+B,EAAE;AAC3C;AAAA,YACJ;AAAA,UACF;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,gBAAI,GAAG,SAAS,gBAAgB,kBAAkB;AAChD,mBAAK,KAAK,0BAAkC,EAAE;AAAA,YAChD,WAAW,GAAG,QAAQ,gBAAgB,oBAAoB;AACxD,mBAAK,KAAK,4BAAoC,EAAE;AAAA,YAClD;AAAA,UACF;AAAA,QACF;AAEA,cAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,YAAY,GAAG,YAAY,CAAC,CAAC;AACjE,kBAAU,MAAM;AAChB,kBAAU,MAAM;AAChB,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAAA,EACF;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAA8B;AAChC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AACA,SAAK,UAAU;AACf,SAAK,MAAM,mBAAmB;AAC9B,SAAK,YAAY;AACjB,QAAI,KAAK,gBAAgB;AACvB,YAAM,iBAAiB,KAAK,cAAc;AAAA,IAC5C;AAAA,EACF;AACF;","names":["HumanInputEvent"]}
1
+ {"version":3,"sources":["../../src/pipeline/human_input.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type {\n RemoteAudioTrack,\n RemoteParticipant,\n RemoteTrackPublication,\n Room,\n} from '@livekit/rtc-node';\nimport { AudioStream, RoomEvent, TrackSource } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport { log } from '../log.js';\nimport type { STT, SpeechEvent } from '../stt/stt.js';\nimport { SpeechEventType } from '../stt/stt.js';\nimport { CancellablePromise, gracefullyCancel } from '../utils.js';\nimport type { VAD, VADEvent } from '../vad.js';\nimport { VADEventType } from '../vad.js';\n\nexport enum HumanInputEvent {\n START_OF_SPEECH,\n VAD_INFERENCE_DONE,\n END_OF_SPEECH,\n FINAL_TRANSCRIPT,\n INTERIM_TRANSCRIPT,\n}\n\nexport type HumanInputCallbacks = {\n [HumanInputEvent.START_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.VAD_INFERENCE_DONE]: (event: VADEvent) => void;\n [HumanInputEvent.END_OF_SPEECH]: (event: VADEvent) => void;\n [HumanInputEvent.FINAL_TRANSCRIPT]: (event: SpeechEvent) => void;\n [HumanInputEvent.INTERIM_TRANSCRIPT]: (event: SpeechEvent) => void;\n};\n\nexport class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInputCallbacks>) {\n #closed = false;\n #room: Room;\n #vad: VAD;\n #stt: STT;\n #participant: RemoteParticipant;\n #subscribedTrack?: RemoteAudioTrack;\n #recognizeTask?: CancellablePromise<void>;\n #speaking = false;\n #speechProbability = 0;\n #logger = log();\n\n constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant) {\n super();\n this.#room = room;\n this.#vad = vad;\n this.#stt = stt;\n this.#participant = participant;\n\n this.#room.on(RoomEvent.TrackPublished, this.#subscribeToMicrophone.bind(this));\n this.#room.on(RoomEvent.TrackSubscribed, this.#subscribeToMicrophone.bind(this));\n this.#subscribeToMicrophone();\n }\n\n get participant(): RemoteParticipant {\n return this.#participant;\n }\n\n get subscribedTrack(): RemoteAudioTrack | undefined {\n return this.#subscribedTrack;\n }\n\n #subscribeToMicrophone(): void {\n if (!this.#participant) {\n this.#logger.error('Participant is not set');\n return;\n }\n\n let microphonePublication: RemoteTrackPublication | undefined = undefined;\n for (const publication of this.#participant.trackPublications.values()) {\n if (publication.source === TrackSource.SOURCE_MICROPHONE) {\n microphonePublication = publication;\n break;\n }\n }\n if (!microphonePublication) {\n return;\n }\n\n if (!microphonePublication.subscribed) {\n microphonePublication.setSubscribed(true);\n }\n\n const track = microphonePublication.track;\n if (track && track !== this.#subscribedTrack) {\n this.#subscribedTrack = track;\n if (this.#recognizeTask) {\n this.#recognizeTask.cancel();\n }\n\n const audioStream = new AudioStream(track, 16000);\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n this.#recognizeTask = new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n const sttStream = this.#stt.stream();\n const vadStream = this.#vad.stream();\n\n const audioStreamCo = async () => {\n for await (const ev of audioStream) {\n if (cancelled) return;\n sttStream.pushFrame(ev);\n vadStream.pushFrame(ev);\n }\n };\n\n const vadStreamCo = async () => {\n for await (const ev of vadStream) {\n if (cancelled) return;\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.#speaking = true;\n this.emit(HumanInputEvent.START_OF_SPEECH, ev);\n break;\n case VADEventType.INFERENCE_DONE:\n this.#speechProbability = ev.probability;\n this.emit(HumanInputEvent.VAD_INFERENCE_DONE, ev);\n break;\n case VADEventType.END_OF_SPEECH:\n this.#speaking = false;\n this.emit(HumanInputEvent.END_OF_SPEECH, ev);\n break;\n }\n }\n };\n\n const sttStreamCo = async () => {\n for await (const ev of sttStream) {\n if (cancelled) return;\n if (ev.type === SpeechEventType.FINAL_TRANSCRIPT) {\n this.emit(HumanInputEvent.FINAL_TRANSCRIPT, ev);\n } else if (ev.type == SpeechEventType.INTERIM_TRANSCRIPT) {\n this.emit(HumanInputEvent.INTERIM_TRANSCRIPT, ev);\n }\n }\n };\n\n await Promise.all([audioStreamCo(), vadStreamCo(), sttStreamCo()]);\n sttStream.close();\n vadStream.close();\n resolve();\n });\n }\n }\n\n get speaking(): boolean {\n return this.#speaking;\n }\n\n get speakingProbability(): number {\n return this.#speechProbability;\n }\n\n async close() {\n if (this.#closed) {\n throw new Error('HumanInput already closed');\n }\n this.#closed = true;\n this.#room.removeAllListeners();\n this.#speaking = false;\n if (this.#recognizeTask) {\n await gracefullyCancel(this.#recognizeTask);\n }\n }\n}\n"],"mappings":"AASA,SAAS,aAAa,WAAW,mBAAmB;AAEpD,SAAS,oBAAoB;AAC7B,SAAS,WAAW;AAEpB,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,wBAAwB;AAErD,SAAS,oBAAoB;AAEtB,IAAK,kBAAL,kBAAKA,qBAAL;AACL,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AACA,EAAAA,kCAAA;AALU,SAAAA;AAAA,GAAA;AAgBL,MAAM,mBAAoB,aAA6D;AAAA,EAC5F,UAAU;AAAA,EACV;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA,EACZ,qBAAqB;AAAA,EACrB,UAAU,IAAI;AAAA,EAEd,YAAY,MAAY,KAAU,KAAU,aAAgC;AAC1E,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,eAAe;AAEpB,SAAK,MAAM,GAAG,UAAU,gBAAgB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC9E,SAAK,MAAM,GAAG,UAAU,iBAAiB,KAAK,uBAAuB,KAAK,IAAI,CAAC;AAC/E,SAAK,uBAAuB;AAAA,EAC9B;AAAA,EAEA,IAAI,cAAiC;AACnC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,kBAAgD;AAClD,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,yBAA+B;AAC7B,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,wBAAwB;AAC3C;AAAA,IACF;AAEA,QAAI,wBAA4D;AAChE,eAAW,eAAe,KAAK,aAAa,kBAAkB,OAAO,GAAG;AACtE,UAAI,YAAY,WAAW,YAAY,mBAAmB;AACxD,gCAAwB;AACxB;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,uBAAuB;AAC1B;AAAA,IACF;AAEA,QAAI,CAAC,sBAAsB,YAAY;AACrC,4BAAsB,cAAc,IAAI;AAAA,IAC1C;AAEA,UAAM,QAAQ,sBAAsB;AACpC,QAAI,SAAS,UAAU,KAAK,kBAAkB;AAC5C,WAAK,mBAAmB;AACxB,UAAI,KAAK,gBAAgB;AACvB,aAAK,eAAe,OAAO;AAAA,MAC7B;AAEA,YAAM,cAAc,IAAI,YAAY,OAAO,IAAK;AAGhD,WAAK,iBAAiB,IAAI,mBAAmB,OAAO,SAAS,GAAG,aAAa;AAC3E,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AAED,cAAM,YAAY,KAAK,KAAK,OAAO;AACnC,cAAM,YAAY,KAAK,KAAK,OAAO;AAEnC,cAAM,gBAAgB,YAAY;AAChC,2BAAiB,MAAM,aAAa;AAClC,gBAAI,UAAW;AACf,sBAAU,UAAU,EAAE;AACtB,sBAAU,UAAU,EAAE;AAAA,UACxB;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,oBAAQ,GAAG,MAAM;AAAA,cACf,KAAK,aAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,yBAAiC,EAAE;AAC7C;AAAA,cACF,KAAK,aAAa;AAChB,qBAAK,qBAAqB,GAAG;AAC7B,qBAAK,KAAK,4BAAoC,EAAE;AAChD;AAAA,cACF,KAAK,aAAa;AAChB,qBAAK,YAAY;AACjB,qBAAK,KAAK,uBAA+B,EAAE;AAC3C;AAAA,YACJ;AAAA,UACF;AAAA,QACF;AAEA,cAAM,cAAc,YAAY;AAC9B,2BAAiB,MAAM,WAAW;AAChC,gBAAI,UAAW;AACf,gBAAI,GAAG,SAAS,gBAAgB,kBAAkB;AAChD,mBAAK,KAAK,0BAAkC,EAAE;AAAA,YAChD,WAAW,GAAG,QAAQ,gBAAgB,oBAAoB;AACxD,mBAAK,KAAK,4BAAoC,EAAE;AAAA,YAClD;AAAA,UACF;AAAA,QACF;AAEA,cAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,YAAY,GAAG,YAAY,CAAC,CAAC;AACjE,kBAAU,MAAM;AAChB,kBAAU,MAAM;AAChB,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAAA,EACF;AAAA,EAEA,IAAI,WAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,sBAA8B;AAChC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,QAAQ;AACZ,QAAI,KAAK,SAAS;AAChB,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AACA,SAAK,UAAU;AACf,SAAK,MAAM,mBAAmB;AAC9B,SAAK,YAAY;AACjB,QAAI,KAAK,gBAAgB;AACvB,YAAM,iBAAiB,KAAK,cAAc;AAAA,IAC5C;AAAA,EACF;AACF;","names":["HumanInputEvent"]}
@@ -35,12 +35,14 @@ __export(pipeline_agent_exports, {
35
35
  });
36
36
  module.exports = __toCommonJS(pipeline_agent_exports);
37
37
  var import_rtc_node = require("@livekit/rtc-node");
38
+ var import_node_crypto = require("node:crypto");
38
39
  var import_node_events = __toESM(require("node:events"), 1);
39
40
  var import_llm = require("../llm/index.cjs");
40
41
  var import_llm2 = require("../llm/index.cjs");
41
42
  var import_log = require("../log.cjs");
42
43
  var import_stt = require("../stt/index.cjs");
43
44
  var import_basic = require("../tokenize/basic/index.cjs");
45
+ var import_transcription = require("../transcription.cjs");
44
46
  var import_tts = require("../tts/index.cjs");
45
47
  var import_utils = require("../utils.cjs");
46
48
  var import_vad = require("../vad.cjs");
@@ -137,7 +139,7 @@ class VoicePipelineAgent extends import_node_events.default {
137
139
  #pendingAgentReply;
138
140
  #agentReplyTask;
139
141
  #playingSpeech;
140
- #transcribedText = "";
142
+ transcribedText = "";
141
143
  #transcribedInterimText = "";
142
144
  #speechQueueOpen = new import_utils.Future();
143
145
  #speechQueue = new import_utils.AsyncIterableQueue();
@@ -150,6 +152,8 @@ class VoicePipelineAgent extends import_node_events.default {
150
152
  #agentPublication;
151
153
  #lastFinalTranscriptTime;
152
154
  #lastSpeechTime;
155
+ #transcriptionId;
156
+ #agentTranscribedText = "";
153
157
  constructor(vad, stt, llm, tts, opts = defaultVPAOptions) {
154
158
  super();
155
159
  this.#opts = { ...defaultVPAOptions, ...opts };
@@ -165,7 +169,9 @@ class VoicePipelineAgent extends import_node_events.default {
165
169
  this.#tts = tts;
166
170
  this.#deferredValidation = new DeferredReplyValidation(
167
171
  this.#validateReplyIfPossible.bind(this),
168
- this.#opts.minEndpointingDelay
172
+ this.#opts.minEndpointingDelay,
173
+ this,
174
+ this.#opts.turnDetector
169
175
  );
170
176
  }
171
177
  get fncCtx() {
@@ -327,13 +333,48 @@ class VoicePipelineAgent extends import_node_events.default {
327
333
  this.#deferredValidation.onHumanEndOfSpeech(event);
328
334
  });
329
335
  this.#humanInput.on(import_human_input.HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
336
+ if (!this.#transcriptionId) {
337
+ this.#transcriptionId = (0, import_node_crypto.randomUUID)();
338
+ }
330
339
  this.#transcribedInterimText = event.alternatives[0].text;
340
+ this.#room.localParticipant.publishTranscription({
341
+ participantIdentity: this.#humanInput.participant.identity,
342
+ trackSid: this.#humanInput.subscribedTrack.sid,
343
+ segments: [
344
+ {
345
+ text: this.#transcribedInterimText,
346
+ id: this.#transcriptionId,
347
+ final: true,
348
+ startTime: BigInt(0),
349
+ endTime: BigInt(0),
350
+ language: ""
351
+ }
352
+ ]
353
+ });
331
354
  });
332
355
  this.#humanInput.on(import_human_input.HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
333
356
  const newTranscript = event.alternatives[0].text;
334
357
  if (!newTranscript) return;
358
+ if (!this.#transcriptionId) {
359
+ this.#transcriptionId = (0, import_node_crypto.randomUUID)();
360
+ }
335
361
  this.#lastFinalTranscriptTime = Date.now();
336
- this.#transcribedText += (this.#transcribedText ? " " : "") + newTranscript;
362
+ this.transcribedText += (this.transcribedText ? " " : "") + newTranscript;
363
+ this.#room.localParticipant.publishTranscription({
364
+ participantIdentity: this.#humanInput.participant.identity,
365
+ trackSid: this.#humanInput.subscribedTrack.sid,
366
+ segments: [
367
+ {
368
+ text: this.transcribedText,
369
+ id: this.#transcriptionId,
370
+ final: true,
371
+ startTime: BigInt(0),
372
+ endTime: BigInt(0),
373
+ language: ""
374
+ }
375
+ ]
376
+ });
377
+ this.#transcriptionId = void 0;
337
378
  if (this.#opts.preemptiveSynthesis && (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)) {
338
379
  this.#synthesizeAgentReply();
339
380
  }
@@ -384,7 +425,7 @@ class VoicePipelineAgent extends import_node_events.default {
384
425
  this.#pendingAgentReply = import_speech_handle.SpeechHandle.createAssistantReply(
385
426
  this.#opts.allowInterruptions,
386
427
  true,
387
- this.#transcribedText
428
+ this.transcribedText
388
429
  );
389
430
  const newHandle = this.#pendingAgentReply;
390
431
  this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
@@ -461,7 +502,7 @@ class VoicePipelineAgent extends import_node_events.default {
461
502
  const userMsg = import_llm2.ChatMessage.create({ text: userQuestion, role: import_llm2.ChatRole.USER });
462
503
  this.chatCtx.messages.push(userMsg);
463
504
  this.emit(4 /* USER_SPEECH_COMMITTED */, userMsg);
464
- this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
505
+ this.transcribedText = this.transcribedText.slice(userQuestion.length);
465
506
  handle.markUserCommitted();
466
507
  };
467
508
  commitUserQuestionIfNeeded();
@@ -475,7 +516,7 @@ class VoicePipelineAgent extends import_node_events.default {
475
516
  if (handle.interrupted) break;
476
517
  }
477
518
  commitUserQuestionIfNeeded();
478
- const collectedText = handle.synthesisHandle.text;
519
+ let collectedText = this.#agentTranscribedText;
479
520
  const isUsingTools = handle.source instanceof import_llm.LLMStream && !!handle.source.functionCalls.length;
480
521
  const interrupted = handle.interrupted;
481
522
  if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
@@ -483,7 +524,7 @@ class VoicePipelineAgent extends import_node_events.default {
483
524
  this.chatCtx.messages.push(...handle.extraToolsMessages);
484
525
  }
485
526
  if (interrupted) {
486
- collectedText + "\u2026";
527
+ collectedText += "\u2026";
487
528
  }
488
529
  const msg = import_llm2.ChatMessage.create({ text: collectedText, role: import_llm2.ChatRole.ASSISTANT });
489
530
  this.chatCtx.messages.push(msg);
@@ -578,6 +619,15 @@ class VoicePipelineAgent extends import_node_events.default {
578
619
  handle.setDone();
579
620
  }
580
621
  #synthesizeAgentSpeech(speechId, source) {
622
+ const synchronizer = new import_transcription.TextAudioSynchronizer(import_transcription.defaultTextSyncOptions);
623
+ synchronizer.on("textUpdated", (text) => {
624
+ this.#agentTranscribedText = text.text;
625
+ this.#room.localParticipant.publishTranscription({
626
+ participantIdentity: this.#room.localParticipant.identity,
627
+ trackSid: this.#agentPublication.sid,
628
+ segments: [text]
629
+ });
630
+ });
581
631
  if (!this.#agentOutput) {
582
632
  throw new Error("agent output should be initialized when ready");
583
633
  }
@@ -591,7 +641,7 @@ class VoicePipelineAgent extends import_node_events.default {
591
641
  if (!ttsSource) {
592
642
  throw new Error("beforeTTSCallback must return string or AsyncIterable<string>");
593
643
  }
594
- return this.#agentOutput.synthesize(speechId, ttsSource);
644
+ return this.#agentOutput.synthesize(speechId, ttsSource, synchronizer);
595
645
  }
596
646
  async #validateReplyIfPossible() {
597
647
  if (this.#playingSpeech && !this.#playingSpeech.allowInterruptions) {
@@ -599,7 +649,7 @@ class VoicePipelineAgent extends import_node_events.default {
599
649
  return;
600
650
  }
601
651
  if (!this.#pendingAgentReply) {
602
- if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
652
+ if (this.#opts.preemptiveSynthesis || !this.transcribedText) {
603
653
  return;
604
654
  }
605
655
  this.#synthesizeAgentReply();
@@ -681,6 +731,7 @@ class DeferredReplyValidation {
681
731
  PUNCTUATION_REDUCE_FACTOR = 0.75;
682
732
  LATE_TRANSCRIPT_TOLERANCE = 1.5;
683
733
  // late compared to end of speech
734
+ UNLIKELY_ENDPOINT_DELAY = 6e3;
684
735
  #validateFunc;
685
736
  #validatingPromise;
686
737
  #validatingFuture = new import_utils.Future();
@@ -689,11 +740,15 @@ class DeferredReplyValidation {
689
740
  #speaking = false;
690
741
  #endOfSpeechDelay;
691
742
  #finalTranscriptDelay;
743
+ #turnDetector;
744
+ #agent;
692
745
  #abort;
693
- constructor(validateFunc, minEndpointingDelay) {
746
+ constructor(validateFunc, minEndpointingDelay, agent, turnDetector) {
694
747
  this.#validateFunc = validateFunc;
695
748
  this.#endOfSpeechDelay = minEndpointingDelay;
696
749
  this.#finalTranscriptDelay = minEndpointingDelay;
750
+ this.#agent = agent;
751
+ this.#turnDetector = turnDetector;
697
752
  }
698
753
  get validating() {
699
754
  return !this.#validatingFuture.done;
@@ -733,7 +788,17 @@ class DeferredReplyValidation {
733
788
  }
734
789
  #run(delay) {
735
790
  var _a;
736
- const runTask = async (delay2, signal) => {
791
+ const runTask = async (delay2, chatCtx, signal) => {
792
+ if (this.#lastFinalTranscript && !this.#speaking && this.#turnDetector) {
793
+ const startTime = Date.now();
794
+ const eotProb = await this.#turnDetector.predictEndOfTurn(chatCtx);
795
+ const unlikelyThreshold = this.#turnDetector.unlikelyThreshold;
796
+ const elapsed = Date.now() - startTime;
797
+ if (eotProb < unlikelyThreshold) {
798
+ delay2 = this.UNLIKELY_ENDPOINT_DELAY;
799
+ }
800
+ delay2 = Math.max(0, delay2 - elapsed);
801
+ }
737
802
  const timeout = setTimeout(() => {
738
803
  this.#resetStates();
739
804
  this.#validateFunc();
@@ -745,7 +810,9 @@ class DeferredReplyValidation {
745
810
  (_a = this.#abort) == null ? void 0 : _a.abort();
746
811
  this.#abort = new AbortController();
747
812
  this.#validatingFuture = new import_utils.Future();
748
- this.#validatingPromise = runTask(delay, this.#abort.signal);
813
+ const detectCtx = this.#agent.chatCtx.copy();
814
+ detectCtx.append({ text: this.#agent.transcribedText, role: import_llm2.ChatRole.USER });
815
+ this.#validatingPromise = runTask(delay, detectCtx, this.#abort.signal);
749
816
  }
750
817
  }
751
818
  // Annotate the CommonJS export names for ESM import in node: