@livekit/agents 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47):
  1. package/dist/audio.cjs +1 -1
  2. package/dist/audio.cjs.map +1 -1
  3. package/dist/audio.js +1 -1
  4. package/dist/audio.js.map +1 -1
  5. package/dist/constants.cjs +38 -0
  6. package/dist/constants.cjs.map +1 -0
  7. package/dist/constants.d.ts +5 -0
  8. package/dist/constants.d.ts.map +1 -0
  9. package/dist/constants.js +11 -0
  10. package/dist/constants.js.map +1 -0
  11. package/dist/ipc/inference_proc_lazy_main.cjs +14 -27
  12. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  13. package/dist/ipc/inference_proc_lazy_main.js +14 -5
  14. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  15. package/dist/ipc/job_proc_lazy_main.cjs +23 -10
  16. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  17. package/dist/ipc/job_proc_lazy_main.js +23 -10
  18. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  19. package/dist/ipc/supervised_proc.cjs +4 -5
  20. package/dist/ipc/supervised_proc.cjs.map +1 -1
  21. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  22. package/dist/ipc/supervised_proc.js +4 -5
  23. package/dist/ipc/supervised_proc.js.map +1 -1
  24. package/dist/multimodal/multimodal_agent.cjs +26 -9
  25. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  26. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  27. package/dist/multimodal/multimodal_agent.js +30 -9
  28. package/dist/multimodal/multimodal_agent.js.map +1 -1
  29. package/dist/pipeline/agent_playout.cjs +1 -1
  30. package/dist/pipeline/agent_playout.cjs.map +1 -1
  31. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  32. package/dist/pipeline/agent_playout.js +1 -1
  33. package/dist/pipeline/agent_playout.js.map +1 -1
  34. package/dist/pipeline/pipeline_agent.cjs +52 -36
  35. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  36. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  37. package/dist/pipeline/pipeline_agent.js +56 -36
  38. package/dist/pipeline/pipeline_agent.js.map +1 -1
  39. package/package.json +1 -1
  40. package/src/audio.ts +1 -1
  41. package/src/constants.ts +7 -0
  42. package/src/ipc/inference_proc_lazy_main.ts +21 -6
  43. package/src/ipc/job_proc_lazy_main.ts +27 -9
  44. package/src/ipc/supervised_proc.ts +5 -6
  45. package/src/multimodal/multimodal_agent.ts +32 -10
  46. package/src/pipeline/agent_playout.ts +1 -7
  47. package/src/pipeline/pipeline_agent.ts +64 -36
@@ -17,6 +17,11 @@ import {
17
17
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
18
18
  import { randomUUID } from 'node:crypto';
19
19
  import EventEmitter from 'node:events';
20
+ import {
21
+ ATTRIBUTE_TRANSCRIPTION_FINAL,
22
+ ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
23
+ TOPIC_TRANSCRIPTION,
24
+ } from '../constants.js';
20
25
  import type {
21
26
  CallableFunctionResult,
22
27
  FunctionCallInfo,
@@ -518,28 +523,21 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
518
523
  this.emit(VPAEvent.USER_STOPPED_SPEAKING);
519
524
  this.#deferredValidation.onHumanEndOfSpeech(event);
520
525
  });
521
- this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
526
+ this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, async (event) => {
522
527
  if (!this.#transcriptionId) {
523
528
  this.#transcriptionId = randomUUID();
524
529
  }
525
530
  this.#transcribedInterimText = event.alternatives![0].text;
526
531
 
527
- this.#room!.localParticipant!.publishTranscription({
528
- participantIdentity: this.#humanInput!.participant.identity,
529
- trackSid: this.#humanInput!.subscribedTrack!.sid!,
530
- segments: [
531
- {
532
- text: this.#transcribedInterimText,
533
- id: this.#transcriptionId,
534
- final: true,
535
- startTime: BigInt(0),
536
- endTime: BigInt(0),
537
- language: '',
538
- },
539
- ],
540
- });
532
+ await this.#publishTranscription(
533
+ this.#humanInput!.participant.identity,
534
+ this.#humanInput!.subscribedTrack!.sid!,
535
+ this.#transcribedInterimText,
536
+ false,
537
+ this.#transcriptionId,
538
+ );
541
539
  });
542
- this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
540
+ this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, async (event) => {
543
541
  const newTranscript = event.alternatives![0].text;
544
542
  if (!newTranscript) return;
545
543
 
@@ -550,20 +548,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
550
548
  this.#lastFinalTranscriptTime = Date.now();
551
549
  this.transcribedText += (this.transcribedText ? ' ' : '') + newTranscript;
552
550
 
553
- this.#room!.localParticipant!.publishTranscription({
554
- participantIdentity: this.#humanInput!.participant.identity,
555
- trackSid: this.#humanInput!.subscribedTrack!.sid!,
556
- segments: [
557
- {
558
- text: this.transcribedText,
559
- id: this.#transcriptionId,
560
- final: true,
561
- startTime: BigInt(0),
562
- endTime: BigInt(0),
563
- language: '',
564
- },
565
- ],
566
- });
551
+ await this.#publishTranscription(
552
+ this.#humanInput!.participant.identity,
553
+ this.#humanInput!.subscribedTrack!.sid!,
554
+ this.transcribedText,
555
+ true,
556
+ this.#transcriptionId,
557
+ );
558
+
567
559
  this.#transcriptionId = undefined;
568
560
 
569
561
  if (
@@ -894,18 +886,54 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
894
886
  handle.setDone();
895
887
  }
896
888
 
889
+ async #publishTranscription(
890
+ participantIdentity: string,
891
+ trackSid: string,
892
+ text: string,
893
+ isFinal: boolean,
894
+ id: string,
895
+ ) {
896
+ this.#room!.localParticipant!.publishTranscription({
897
+ participantIdentity: participantIdentity,
898
+ trackSid: trackSid,
899
+ segments: [
900
+ {
901
+ text: text,
902
+ final: isFinal,
903
+ id: id,
904
+ startTime: BigInt(0),
905
+ endTime: BigInt(0),
906
+ language: '',
907
+ },
908
+ ],
909
+ });
910
+ const stream = await this.#room!.localParticipant!.streamText({
911
+ senderIdentity: participantIdentity,
912
+ topic: TOPIC_TRANSCRIPTION,
913
+ attributes: {
914
+ [ATTRIBUTE_TRANSCRIPTION_TRACK_ID]: trackSid,
915
+ [ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal.toString(),
916
+ },
917
+ });
918
+ await stream.write(text);
919
+ await stream.close();
920
+ }
921
+
897
922
  #synthesizeAgentSpeech(
898
923
  speechId: string,
899
924
  source: string | LLMStream | AsyncIterable<string>,
900
925
  ): SynthesisHandle {
901
926
  const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
902
- synchronizer.on('textUpdated', (text) => {
927
+ // TODO: where possible we would want to use deltas instead of full text segments, esp for LLM streams over the streamText API
928
+ synchronizer.on('textUpdated', async (text) => {
903
929
  this.#agentTranscribedText = text.text;
904
- this.#room!.localParticipant!.publishTranscription({
905
- participantIdentity: this.#room!.localParticipant!.identity,
906
- trackSid: this.#agentPublication!.sid!,
907
- segments: [text],
908
- });
930
+ await this.#publishTranscription(
931
+ this.#room!.localParticipant!.identity!,
932
+ this.#agentPublication?.sid ?? '',
933
+ text.text,
934
+ text.final,
935
+ text.id,
936
+ );
909
937
  });
910
938
 
911
939
  if (!this.#agentOutput) {