@livekit/agents 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/audio.cjs +1 -1
  2. package/dist/audio.cjs.map +1 -1
  3. package/dist/audio.js +1 -1
  4. package/dist/audio.js.map +1 -1
  5. package/dist/constants.cjs +38 -0
  6. package/dist/constants.cjs.map +1 -0
  7. package/dist/constants.d.ts +5 -0
  8. package/dist/constants.d.ts.map +1 -0
  9. package/dist/constants.js +11 -0
  10. package/dist/constants.js.map +1 -0
  11. package/dist/inference_runner.cjs.map +1 -1
  12. package/dist/inference_runner.d.ts +1 -0
  13. package/dist/inference_runner.d.ts.map +1 -1
  14. package/dist/inference_runner.js.map +1 -1
  15. package/dist/ipc/inference_proc_lazy_main.cjs +19 -27
  16. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  17. package/dist/ipc/inference_proc_lazy_main.js +19 -5
  18. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  19. package/dist/ipc/job_proc_lazy_main.cjs +23 -10
  20. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  21. package/dist/ipc/job_proc_lazy_main.js +23 -10
  22. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  23. package/dist/ipc/supervised_proc.cjs +4 -5
  24. package/dist/ipc/supervised_proc.cjs.map +1 -1
  25. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  26. package/dist/ipc/supervised_proc.js +4 -5
  27. package/dist/ipc/supervised_proc.js.map +1 -1
  28. package/dist/multimodal/multimodal_agent.cjs +26 -9
  29. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  30. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  31. package/dist/multimodal/multimodal_agent.js +30 -9
  32. package/dist/multimodal/multimodal_agent.js.map +1 -1
  33. package/dist/pipeline/agent_playout.cjs +1 -1
  34. package/dist/pipeline/agent_playout.cjs.map +1 -1
  35. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  36. package/dist/pipeline/agent_playout.js +1 -1
  37. package/dist/pipeline/agent_playout.js.map +1 -1
  38. package/dist/pipeline/pipeline_agent.cjs +52 -36
  39. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  40. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  41. package/dist/pipeline/pipeline_agent.js +56 -36
  42. package/dist/pipeline/pipeline_agent.js.map +1 -1
  43. package/package.json +3 -3
  44. package/src/audio.ts +1 -1
  45. package/src/constants.ts +7 -0
  46. package/src/inference_runner.ts +1 -0
  47. package/src/ipc/inference_proc_lazy_main.ts +27 -6
  48. package/src/ipc/job_proc_lazy_main.ts +27 -9
  49. package/src/ipc/supervised_proc.ts +5 -6
  50. package/src/multimodal/multimodal_agent.ts +32 -10
  51. package/src/pipeline/agent_playout.ts +1 -7
  52. package/src/pipeline/pipeline_agent.ts +64 -36
@@ -168,15 +168,9 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
168
168
  handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;
169
169
  handle.synchronizer.pushAudio(frame);
170
170
  await this.#audioSource.captureFrame(frame);
171
- await this.#audioSource.waitForPlayout();
172
171
  }
173
172
 
174
- // XXX(nbsp): line 161 waits instead of this. this is not the case on python agents,
175
- // but for some reason too many TTS frames can gunk up the buffer and lead to
176
- // FFI errors. this works 🤷‍♀️
177
- // if (this.#audioSource.queuedDuration > 0) {
178
- // await this.#audioSource.waitForPlayout();
179
- // }
173
+ await this.#audioSource.waitForPlayout();
180
174
 
181
175
  handle.synchronizer.close(false);
182
176
  resolve();
@@ -17,6 +17,11 @@ import {
17
17
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
18
18
  import { randomUUID } from 'node:crypto';
19
19
  import EventEmitter from 'node:events';
20
+ import {
21
+ ATTRIBUTE_TRANSCRIPTION_FINAL,
22
+ ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
23
+ TOPIC_TRANSCRIPTION,
24
+ } from '../constants.js';
20
25
  import type {
21
26
  CallableFunctionResult,
22
27
  FunctionCallInfo,
@@ -518,28 +523,21 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
518
523
  this.emit(VPAEvent.USER_STOPPED_SPEAKING);
519
524
  this.#deferredValidation.onHumanEndOfSpeech(event);
520
525
  });
521
- this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
526
+ this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, async (event) => {
522
527
  if (!this.#transcriptionId) {
523
528
  this.#transcriptionId = randomUUID();
524
529
  }
525
530
  this.#transcribedInterimText = event.alternatives![0].text;
526
531
 
527
- this.#room!.localParticipant!.publishTranscription({
528
- participantIdentity: this.#humanInput!.participant.identity,
529
- trackSid: this.#humanInput!.subscribedTrack!.sid!,
530
- segments: [
531
- {
532
- text: this.#transcribedInterimText,
533
- id: this.#transcriptionId,
534
- final: true,
535
- startTime: BigInt(0),
536
- endTime: BigInt(0),
537
- language: '',
538
- },
539
- ],
540
- });
532
+ await this.#publishTranscription(
533
+ this.#humanInput!.participant.identity,
534
+ this.#humanInput!.subscribedTrack!.sid!,
535
+ this.#transcribedInterimText,
536
+ false,
537
+ this.#transcriptionId,
538
+ );
541
539
  });
542
- this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
540
+ this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, async (event) => {
543
541
  const newTranscript = event.alternatives![0].text;
544
542
  if (!newTranscript) return;
545
543
 
@@ -550,20 +548,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
550
548
  this.#lastFinalTranscriptTime = Date.now();
551
549
  this.transcribedText += (this.transcribedText ? ' ' : '') + newTranscript;
552
550
 
553
- this.#room!.localParticipant!.publishTranscription({
554
- participantIdentity: this.#humanInput!.participant.identity,
555
- trackSid: this.#humanInput!.subscribedTrack!.sid!,
556
- segments: [
557
- {
558
- text: this.transcribedText,
559
- id: this.#transcriptionId,
560
- final: true,
561
- startTime: BigInt(0),
562
- endTime: BigInt(0),
563
- language: '',
564
- },
565
- ],
566
- });
551
+ await this.#publishTranscription(
552
+ this.#humanInput!.participant.identity,
553
+ this.#humanInput!.subscribedTrack!.sid!,
554
+ this.transcribedText,
555
+ true,
556
+ this.#transcriptionId,
557
+ );
558
+
567
559
  this.#transcriptionId = undefined;
568
560
 
569
561
  if (
@@ -894,18 +886,54 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
894
886
  handle.setDone();
895
887
  }
896
888
 
889
+ async #publishTranscription(
890
+ participantIdentity: string,
891
+ trackSid: string,
892
+ text: string,
893
+ isFinal: boolean,
894
+ id: string,
895
+ ) {
896
+ this.#room!.localParticipant!.publishTranscription({
897
+ participantIdentity: participantIdentity,
898
+ trackSid: trackSid,
899
+ segments: [
900
+ {
901
+ text: text,
902
+ final: isFinal,
903
+ id: id,
904
+ startTime: BigInt(0),
905
+ endTime: BigInt(0),
906
+ language: '',
907
+ },
908
+ ],
909
+ });
910
+ const stream = await this.#room!.localParticipant!.streamText({
911
+ senderIdentity: participantIdentity,
912
+ topic: TOPIC_TRANSCRIPTION,
913
+ attributes: {
914
+ [ATTRIBUTE_TRANSCRIPTION_TRACK_ID]: trackSid,
915
+ [ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal.toString(),
916
+ },
917
+ });
918
+ await stream.write(text);
919
+ await stream.close();
920
+ }
921
+
897
922
  #synthesizeAgentSpeech(
898
923
  speechId: string,
899
924
  source: string | LLMStream | AsyncIterable<string>,
900
925
  ): SynthesisHandle {
901
926
  const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
902
- synchronizer.on('textUpdated', (text) => {
927
+ // TODO: where possible we would want to use deltas instead of full text segments, esp for LLM streams over the streamText API
928
+ synchronizer.on('textUpdated', async (text) => {
903
929
  this.#agentTranscribedText = text.text;
904
- this.#room!.localParticipant!.publishTranscription({
905
- participantIdentity: this.#room!.localParticipant!.identity,
906
- trackSid: this.#agentPublication!.sid!,
907
- segments: [text],
908
- });
930
+ await this.#publishTranscription(
931
+ this.#room!.localParticipant!.identity!,
932
+ this.#agentPublication?.sid ?? '',
933
+ text.text,
934
+ text.final,
935
+ text.id,
936
+ );
909
937
  });
910
938
 
911
939
  if (!this.#agentOutput) {