@livekit/agents 0.5.2 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140):
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/job.cjs.map +1 -1
  8. package/dist/job.js.map +1 -1
  9. package/dist/llm/index.cjs +2 -0
  10. package/dist/llm/index.cjs.map +1 -1
  11. package/dist/llm/index.d.ts +1 -1
  12. package/dist/llm/index.d.ts.map +1 -1
  13. package/dist/llm/index.js +2 -0
  14. package/dist/llm/index.js.map +1 -1
  15. package/dist/llm/llm.cjs +47 -3
  16. package/dist/llm/llm.cjs.map +1 -1
  17. package/dist/llm/llm.d.ts +15 -2
  18. package/dist/llm/llm.d.ts.map +1 -1
  19. package/dist/llm/llm.js +46 -3
  20. package/dist/llm/llm.js.map +1 -1
  21. package/dist/metrics/base.cjs +44 -0
  22. package/dist/metrics/base.cjs.map +1 -0
  23. package/dist/metrics/base.d.ts +96 -0
  24. package/dist/metrics/base.d.ts.map +1 -0
  25. package/dist/metrics/base.js +20 -0
  26. package/dist/metrics/base.js.map +1 -0
  27. package/dist/metrics/index.cjs +35 -0
  28. package/dist/metrics/index.cjs.map +1 -0
  29. package/dist/metrics/index.d.ts +5 -0
  30. package/dist/metrics/index.d.ts.map +1 -0
  31. package/dist/metrics/index.js +9 -0
  32. package/dist/metrics/index.js.map +1 -0
  33. package/dist/metrics/usage_collector.cjs +53 -0
  34. package/dist/metrics/usage_collector.cjs.map +1 -0
  35. package/dist/metrics/usage_collector.d.ts +14 -0
  36. package/dist/metrics/usage_collector.d.ts.map +1 -0
  37. package/dist/metrics/usage_collector.js +29 -0
  38. package/dist/metrics/usage_collector.js.map +1 -0
  39. package/dist/metrics/utils.cjs +104 -0
  40. package/dist/metrics/utils.cjs.map +1 -0
  41. package/dist/metrics/utils.d.ts +10 -0
  42. package/dist/metrics/utils.d.ts.map +1 -0
  43. package/dist/metrics/utils.js +73 -0
  44. package/dist/metrics/utils.js.map +1 -0
  45. package/dist/multimodal/multimodal_agent.cjs +34 -16
  46. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  47. package/dist/multimodal/multimodal_agent.d.ts +4 -5
  48. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  49. package/dist/multimodal/multimodal_agent.js +34 -16
  50. package/dist/multimodal/multimodal_agent.js.map +1 -1
  51. package/dist/pipeline/index.cjs +2 -0
  52. package/dist/pipeline/index.cjs.map +1 -1
  53. package/dist/pipeline/index.d.ts +1 -1
  54. package/dist/pipeline/index.d.ts.map +1 -1
  55. package/dist/pipeline/index.js +3 -1
  56. package/dist/pipeline/index.js.map +1 -1
  57. package/dist/pipeline/pipeline_agent.cjs +166 -66
  58. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  59. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  60. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  61. package/dist/pipeline/pipeline_agent.js +169 -69
  62. package/dist/pipeline/pipeline_agent.js.map +1 -1
  63. package/dist/pipeline/speech_handle.cjs +49 -1
  64. package/dist/pipeline/speech_handle.cjs.map +1 -1
  65. package/dist/pipeline/speech_handle.d.ts +12 -2
  66. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  67. package/dist/pipeline/speech_handle.js +50 -2
  68. package/dist/pipeline/speech_handle.js.map +1 -1
  69. package/dist/stt/index.cjs.map +1 -1
  70. package/dist/stt/index.d.ts +1 -1
  71. package/dist/stt/index.d.ts.map +1 -1
  72. package/dist/stt/index.js.map +1 -1
  73. package/dist/stt/stream_adapter.cjs +15 -5
  74. package/dist/stt/stream_adapter.cjs.map +1 -1
  75. package/dist/stt/stream_adapter.d.ts +4 -1
  76. package/dist/stt/stream_adapter.d.ts.map +1 -1
  77. package/dist/stt/stream_adapter.js +15 -5
  78. package/dist/stt/stream_adapter.js.map +1 -1
  79. package/dist/stt/stt.cjs +46 -2
  80. package/dist/stt/stt.cjs.map +1 -1
  81. package/dist/stt/stt.d.ts +25 -3
  82. package/dist/stt/stt.d.ts.map +1 -1
  83. package/dist/stt/stt.js +46 -2
  84. package/dist/stt/stt.js.map +1 -1
  85. package/dist/tts/index.cjs +4 -2
  86. package/dist/tts/index.cjs.map +1 -1
  87. package/dist/tts/index.d.ts +1 -1
  88. package/dist/tts/index.d.ts.map +1 -1
  89. package/dist/tts/index.js +3 -1
  90. package/dist/tts/index.js.map +1 -1
  91. package/dist/tts/stream_adapter.cjs +14 -3
  92. package/dist/tts/stream_adapter.cjs.map +1 -1
  93. package/dist/tts/stream_adapter.d.ts +3 -0
  94. package/dist/tts/stream_adapter.d.ts.map +1 -1
  95. package/dist/tts/stream_adapter.js +15 -4
  96. package/dist/tts/stream_adapter.js.map +1 -1
  97. package/dist/tts/tts.cjs +109 -6
  98. package/dist/tts/tts.cjs.map +1 -1
  99. package/dist/tts/tts.d.ts +24 -1
  100. package/dist/tts/tts.d.ts.map +1 -1
  101. package/dist/tts/tts.js +107 -5
  102. package/dist/tts/tts.js.map +1 -1
  103. package/dist/utils.cjs +11 -4
  104. package/dist/utils.cjs.map +1 -1
  105. package/dist/utils.d.ts.map +1 -1
  106. package/dist/utils.js +11 -4
  107. package/dist/utils.js.map +1 -1
  108. package/dist/vad.cjs +43 -2
  109. package/dist/vad.cjs.map +1 -1
  110. package/dist/vad.d.ts +21 -4
  111. package/dist/vad.d.ts.map +1 -1
  112. package/dist/vad.js +43 -2
  113. package/dist/vad.js.map +1 -1
  114. package/dist/worker.cjs +5 -2
  115. package/dist/worker.cjs.map +1 -1
  116. package/dist/worker.d.ts.map +1 -1
  117. package/dist/worker.js +5 -2
  118. package/dist/worker.js.map +1 -1
  119. package/package.json +3 -3
  120. package/src/index.ts +2 -1
  121. package/src/job.ts +3 -3
  122. package/src/llm/index.ts +2 -0
  123. package/src/llm/llm.ts +55 -3
  124. package/src/metrics/base.ts +127 -0
  125. package/src/metrics/index.ts +20 -0
  126. package/src/metrics/usage_collector.ts +40 -0
  127. package/src/metrics/utils.ts +100 -0
  128. package/src/multimodal/multimodal_agent.ts +57 -23
  129. package/src/pipeline/index.ts +1 -1
  130. package/src/pipeline/pipeline_agent.ts +208 -89
  131. package/src/pipeline/speech_handle.ts +67 -2
  132. package/src/stt/index.ts +2 -0
  133. package/src/stt/stream_adapter.ts +17 -5
  134. package/src/stt/stt.ts +67 -3
  135. package/src/tts/index.ts +2 -0
  136. package/src/tts/stream_adapter.ts +17 -4
  137. package/src/tts/tts.ts +127 -4
  138. package/src/utils.ts +12 -4
  139. package/src/vad.ts +61 -4
  140. package/src/worker.ts +7 -3
package/src/index.ts CHANGED
@@ -11,6 +11,7 @@
11
11
  */
12
12
  import * as cli from './cli.js';
13
13
  import * as llm from './llm/index.js';
14
+ import * as metrics from './metrics/index.js';
14
15
  import * as multimodal from './multimodal/index.js';
15
16
  import * as pipeline from './pipeline/index.js';
16
17
  import * as stt from './stt/index.js';
@@ -28,4 +29,4 @@ export * from './generator.js';
28
29
  export * from './audio.js';
29
30
  export * from './transcription.js';
30
31
 
31
- export { cli, stt, tts, llm, pipeline, multimodal, tokenize };
32
+ export { cli, stt, tts, llm, pipeline, multimodal, tokenize, metrics };
package/src/job.ts CHANGED
@@ -190,15 +190,15 @@ export class JobContext {
190
190
  /** @internal */
191
191
  onParticipantConnected(p: RemoteParticipant) {
192
192
  for (const callback of this.#participantEntrypoints) {
193
- if (this.#participantTasks[p.identity]?.callback == callback) {
193
+ if (this.#participantTasks[p.identity!]?.callback == callback) {
194
194
  this.#logger.warn(
195
195
  'a participant has joined before a prior prticipant task matching the same identity has finished:',
196
196
  p.identity,
197
197
  );
198
198
  }
199
199
  const result = callback(this, p);
200
- result.finally(() => delete this.#participantTasks[p.identity]);
201
- this.#participantTasks[p.identity] = { callback, result };
200
+ result.finally(() => delete this.#participantTasks[p.identity!]);
201
+ this.#participantTasks[p.identity!] = { callback, result };
202
202
  }
203
203
  }
204
204
 
package/src/llm/index.ts CHANGED
@@ -25,6 +25,8 @@ export {
25
25
  type CompletionUsage,
26
26
  type Choice,
27
27
  type ChatChunk,
28
+ type LLMCallbacks,
29
+ LLMEvent,
28
30
  LLM,
29
31
  LLMStream,
30
32
  } from './llm.js';
package/src/llm/llm.ts CHANGED
@@ -1,6 +1,9 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
5
+ import { EventEmitter } from 'node:events';
6
+ import type { LLMMetrics } from '../metrics/base.js';
4
7
  import { AsyncIterableQueue } from '../utils.js';
5
8
  import type { ChatContext, ChatRole } from './chat_context.js';
6
9
  import type { FunctionCallInfo, FunctionContext } from './function_context.js';
@@ -28,7 +31,15 @@ export interface ChatChunk {
28
31
  usage?: CompletionUsage;
29
32
  }
30
33
 
31
- export abstract class LLM {
34
/** Events that an {@link LLM} instance emits. */
export enum LLMEvent {
  /** Emitted with the metrics gathered for one LLM stream. */
  METRICS_COLLECTED = 0,
}
37
+
38
/** Listener signatures for {@link LLM} events, keyed by {@link LLMEvent} member. */
export type LLMCallbacks = {
  /** Called with the metrics collected for a single stream. */
  [LLMEvent.METRICS_COLLECTED]: (collected: LLMMetrics) => void;
};
41
+
42
+ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCallbacks>) {
32
43
  /**
33
44
  * Returns a {@link LLMStream} that can be used to push text and receive LLM responses.
34
45
  */
@@ -48,16 +59,56 @@ export abstract class LLM {
48
59
  }
49
60
 
50
61
  export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
62
+ protected output = new AsyncIterableQueue<ChatChunk>();
51
63
  protected queue = new AsyncIterableQueue<ChatChunk>();
52
64
  protected closed = false;
53
65
  protected _functionCalls: FunctionCallInfo[] = [];
66
+ abstract label: string;
54
67
 
68
+ #llm: LLM;
55
69
  #chatCtx: ChatContext;
56
70
  #fncCtx?: FunctionContext;
57
71
 
58
- constructor(chatCtx: ChatContext, fncCtx?: FunctionContext) {
72
+ constructor(llm: LLM, chatCtx: ChatContext, fncCtx?: FunctionContext) {
73
+ this.#llm = llm;
59
74
  this.#chatCtx = chatCtx;
60
75
  this.#fncCtx = fncCtx;
76
+ this.monitorMetrics();
77
+ }
78
+
79
/**
 * Relays every chunk from the internal `queue` to `output` while timing the
 * stream, then emits {@link LLMEvent.METRICS_COLLECTED} on the owning LLM once
 * the queue is exhausted.
 *
 * Reported values:
 * - `ttft` and `duration` are whole milliseconds measured from the moment this
 *   method starts; `ttft` is `-1` when the stream ended without a single chunk.
 * - `tokensPerSecond` is computed from the un-truncated duration and is `0`
 *   when no time elapsed, so it is never `Infinity` or `NaN`.
 * - token counts default to `0` when no chunk carried `usage`.
 */
protected async monitorMetrics() {
  const NS_PER_MS = BigInt(1000000);
  const NS_PER_SECOND = 1e9;

  const startTime = process.hrtime.bigint();
  let ttft: bigint | undefined;
  let requestId = '';
  let usage: CompletionUsage | undefined;

  for await (const ev of this.queue) {
    this.output.put(ev);
    requestId = ev.requestId;
    // Compare against undefined explicitly: a measured 0n is a valid reading.
    if (ttft === undefined) {
      ttft = process.hrtime.bigint() - startTime;
    }
    if (ev.usage) {
      usage = ev.usage;
    }
  }
  this.output.close();

  const duration = process.hrtime.bigint() - startTime;
  const completionTokens = usage?.completionTokens ?? 0;
  // Keep fractional seconds; truncating to whole seconds made every
  // sub-second stream divide by zero.
  const durationSeconds = Number(duration) / NS_PER_SECOND;

  const metrics: LLMMetrics = {
    timestamp: Date.now(),
    requestId,
    // BigInt division already truncates. Mixing undefined with a BigInt would
    // throw, so an empty stream is reported with the -1 sentinel instead.
    ttft: ttft === undefined ? -1 : Number(ttft / NS_PER_MS),
    duration: Number(duration / NS_PER_MS),
    cancelled: false, // XXX(nbsp)
    label: this.label,
    completionTokens,
    promptTokens: usage?.promptTokens ?? 0,
    totalTokens: usage?.totalTokens ?? 0,
    tokensPerSecond: durationSeconds > 0 ? completionTokens / durationSeconds : 0,
  };
  this.#llm.emit(LLMEvent.METRICS_COLLECTED, metrics);
}
62
113
 
63
114
  /** List of called functions from this stream. */
@@ -88,10 +139,11 @@ export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
88
139
  }
89
140
 
90
141
  next(): Promise<IteratorResult<ChatChunk>> {
91
- return this.queue.next();
142
+ return this.output.next();
92
143
  }
93
144
 
94
145
  close() {
146
+ this.output.close();
95
147
  this.queue.close();
96
148
  this.closed = true;
97
149
  }
@@ -0,0 +1,127 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
/** Metrics describing one LLM completion stream. */
export interface LLMMetrics {
  /** Identifier of the request these metrics belong to. */
  requestId: string;
  /** Label of the component that produced the metrics. */
  label: string;
  /** When the metrics were recorded (`LLMStream` uses `Date.now()`). */
  timestamp: number;
  /** Time until the first chunk arrived (`LLMStream` reports milliseconds). */
  ttft: number;
  /** Total time the stream took (`LLMStream` reports milliseconds). */
  duration: number;
  /** Whether the request was cancelled. */
  cancelled: boolean;
  /** Number of prompt tokens consumed. */
  promptTokens: number;
  /** Number of completion tokens generated. */
  completionTokens: number;
  /** Total token count reported for the request. */
  totalTokens: number;
  /** Completion tokens generated per second. */
  tokensPerSecond: number;
  /** Error associated with the request, if any. */
  error?: Error;
}
18
+
19
/** Metrics describing one speech-to-text request. */
export interface STTMetrics {
  /** Identifier of the request these metrics belong to. */
  requestId: string;
  /** Label of the component that produced the metrics. */
  label: string;
  /** When the metrics were recorded. */
  timestamp: number;
  /** Time the request took. NOTE(review): units are not visible here — confirm. */
  duration: number;
  /** Amount of audio processed. NOTE(review): units are not visible here — confirm. */
  audioDuration: number;
  /** Presumably whether a streaming recognizer produced the result — confirm. */
  streamed: boolean;
  /** Error associated with the request, if any. */
  error?: Error;
}
28
+
29
/** Metrics describing one text-to-speech request. */
export interface TTSMetrics {
  /** Identifier of the request these metrics belong to. */
  requestId: string;
  /** Label of the component that produced the metrics. */
  label: string;
  /** When the metrics were recorded. */
  timestamp: number;
  /** Presumably the time until the first audio was received — confirm meaning and units. */
  ttfb: number;
  /** Time the request took. NOTE(review): units are not visible here — confirm. */
  duration: number;
  /** Amount of audio produced. NOTE(review): units are not visible here — confirm. */
  audioDuration: number;
  /** Number of characters that were synthesized. */
  charactersCount: number;
  /** Whether the request was cancelled. */
  cancelled: boolean;
  /** Presumably whether a streaming synthesizer produced the result — confirm. */
  streamed: boolean;
  /** Error associated with the request, if any. */
  error?: Error;
}
41
+
42
/** Metrics reported by a voice-activity detector. */
export interface VADMetrics {
  /** Label of the component that produced the metrics. */
  label: string;
  /** When the metrics were recorded. */
  timestamp: number;
  /** Time spent idle. NOTE(review): units are not visible here — confirm. */
  idleTime: number;
  /** Accumulated inference time. NOTE(review): units are not visible here — confirm. */
  inferenceDurationTotal: number;
  /** Number of inferences covered by these metrics. */
  inferenceCount: number;
}
49
+
50
/** End-of-utterance metrics recorded by the voice pipeline. */
export interface PipelineEOUMetrics {
  /**
   * Identifier shared between related STT, LLM, and TTS metrics so that they
   * can be combined.
   */
  sequenceId: string;
  /** Time at which the event was recorded. */
  timestamp: number;
  /**
   * Time between the VAD reporting the end of speech and the decision that the
   * user's turn is over.
   */
  endOfUtteranceDelay: number;
  /**
   * Time needed to obtain the transcript once the user stopped speaking.
   *
   * @remarks
   * May be 0 when the transcript was already available.
   */
  transcriptionDelay: number;
}
67
+
68
/** LLM metrics tagged with the pipeline sequence they belong to. */
export interface PipelineLLMMetrics extends LLMMetrics {
  /**
   * Identifier shared between related STT, LLM, and TTS metrics so that they
   * can be combined.
   */
  sequenceId: string;
}
74
+
75
/** TTS metrics tagged with the pipeline sequence they belong to. */
export interface PipelineTTSMetrics extends TTSMetrics {
  /**
   * Identifier shared between related STT, LLM, and TTS metrics so that they
   * can be combined.
   */
  sequenceId: string;
}
81
+
82
/** STT metrics as seen by the pipeline; identical in shape to {@link STTMetrics}. */
export type PipelineSTTMetrics = STTMetrics;

/** VAD metrics as seen by the pipeline; identical in shape to {@link VADMetrics}. */
export type PipelineVADMetrics = VADMetrics;
84
+
85
/**
 * Error carrying the optional `type`, `reason`, and `code` details supplied by
 * the caller in addition to the usual message.
 */
export class MultimodalLLMError extends Error {
  type?: string;
  reason?: string;
  code?: string;

  /**
   * @param details - Optional error details; `message` becomes the error
   *   message and the remaining fields are copied onto the instance.
   * @param options - Standard error options forwarded to `Error`.
   */
  constructor(
    details: { type?: string; reason?: string; code?: string; message?: string } = {},
    options?: ErrorOptions,
  ) {
    super(details.message, options);
    this.type = details.type;
    this.reason = details.reason;
    this.code = details.code;
  }
}
104
+
105
/** LLM metrics extended with a per-modality breakdown of token usage. */
export interface MultimodalLLMMetrics extends LLMMetrics {
  /** Breakdown of the input tokens. */
  inputTokenDetails: {
    /** Input tokens counted as cached. */
    cachedTokens: number;
    /** Input tokens counted as text. */
    textTokens: number;
    /** Input tokens counted as audio. */
    audioTokens: number;
  };
  /** Breakdown of the output tokens. */
  outputTokenDetails: {
    /** Output tokens counted as text. */
    textTokens: number;
    /** Output tokens counted as audio. */
    audioTokens: number;
  };
}
116
+
117
/** Any metrics object declared in this module. */
export type AgentMetrics =
  // Component-level metrics
  | STTMetrics
  | LLMMetrics
  | TTSMetrics
  | VADMetrics
  // Pipeline-level metrics
  | PipelineSTTMetrics
  | PipelineEOUMetrics
  | PipelineLLMMetrics
  | PipelineTTSMetrics
  | PipelineVADMetrics
  // Multimodal metrics
  | MultimodalLLMMetrics;
@@ -0,0 +1,20 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ export type {
6
+ AgentMetrics,
7
+ STTMetrics,
8
+ LLMMetrics,
9
+ TTSMetrics,
10
+ VADMetrics,
11
+ PipelineSTTMetrics,
12
+ PipelineEOUMetrics,
13
+ PipelineLLMMetrics,
14
+ PipelineTTSMetrics,
15
+ PipelineVADMetrics,
16
+ MultimodalLLMMetrics,
17
+ } from './base.js';
18
+ export { MultimodalLLMError } from './base.js';
19
+ export { type UsageSummary, UsageCollector } from './usage_collector.js';
20
+ export { logMetrics } from './utils.js';
@@ -0,0 +1,40 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AgentMetrics } from './base.js';
5
+ import { isLLMMetrics, isSTTMetrics, isTTSMetrics } from './utils.js';
6
+
7
/** Running usage totals accumulated by a `UsageCollector`. */
export interface UsageSummary {
  /** Sum of `promptTokens` over all collected LLM metrics. */
  llmPromptTokens: number;
  /** Sum of `completionTokens` over all collected LLM metrics. */
  llmCompletionTokens: number;
  /** Sum of `charactersCount` over all collected TTS metrics. */
  ttsCharactersCount: number;
  /** Sum of `audioDuration` over all collected STT metrics. */
  sttAudioDuration: number;
}
13
+
14
/** Accumulates LLM, TTS, and STT usage from the metrics passed to `collect`. */
export class UsageCollector {
  // All totals start at zero.
  #summary: UsageSummary = {
    llmPromptTokens: 0,
    llmCompletionTokens: 0,
    ttsCharactersCount: 0,
    sttAudioDuration: 0,
  };

  /**
   * Adds the usage carried by `metrics` to the running totals.
   *
   * The metrics are classified as LLM first, then TTS, then STT; metrics that
   * match none of these are ignored.
   */
  collect(metrics: AgentMetrics) {
    const totals = this.#summary;

    if (isLLMMetrics(metrics)) {
      totals.llmPromptTokens += metrics.promptTokens;
      totals.llmCompletionTokens += metrics.completionTokens;
      return;
    }
    if (isTTSMetrics(metrics)) {
      totals.ttsCharactersCount += metrics.charactersCount;
      return;
    }
    if (isSTTMetrics(metrics)) {
      totals.sttAudioDuration += metrics.audioDuration;
    }
  }

  /** A copy of the current totals; mutating it does not affect the collector. */
  get summary(): UsageSummary {
    const snapshot = { ...this.#summary };
    return snapshot;
  }
}
@@ -0,0 +1,100 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { log } from '../log.js';
5
+ import type {
6
+ AgentMetrics,
7
+ LLMMetrics,
8
+ PipelineEOUMetrics,
9
+ PipelineLLMMetrics,
10
+ PipelineTTSMetrics,
11
+ STTMetrics,
12
+ TTSMetrics,
13
+ VADMetrics,
14
+ } from './base.js';
15
+
16
/**
 * Writes an info-level log entry summarising `metrics`.
 *
 * The metrics are classified in this order: pipeline LLM, LLM, pipeline TTS,
 * TTS, pipeline EOU, STT. The first match selects the logged fields and the
 * message; metrics that match none of them are not logged.
 */
export const logMetrics = (metrics: AgentMetrics) => {
  const logger = log();
  let fields: Record<string, unknown>;
  let message: string;

  if (isPipelineLLMMetrics(metrics)) {
    fields = {
      sequenceId: metrics.sequenceId,
      ttft: metrics.ttft,
      inputTokens: metrics.promptTokens,
      outputTokens: metrics.completionTokens,
      tokensPerSecond: metrics.tokensPerSecond,
    };
    message = 'Pipeline LLM metrics';
  } else if (isLLMMetrics(metrics)) {
    fields = {
      ttft: metrics.ttft,
      inputTokens: metrics.promptTokens,
      outputTokens: metrics.completionTokens,
      tokensPerSecond: metrics.tokensPerSecond,
    };
    message = 'LLM metrics';
  } else if (isPipelineTTSMetrics(metrics)) {
    fields = {
      sequenceId: metrics.sequenceId,
      ttfb: metrics.ttfb,
      audioDuration: metrics.audioDuration,
    };
    message = 'Pipeline TTS metrics';
  } else if (isTTSMetrics(metrics)) {
    fields = {
      ttfb: metrics.ttfb,
      audioDuration: metrics.audioDuration,
    };
    message = 'TTS metrics';
  } else if (isPipelineEOUMetrics(metrics)) {
    fields = {
      sequenceId: metrics.sequenceId,
      endOfUtteranceDelay: metrics.endOfUtteranceDelay,
      transcriptionDelay: metrics.transcriptionDelay,
    };
    message = 'Pipeline EOU metrics';
  } else if (isSTTMetrics(metrics)) {
    fields = {
      audioDuration: metrics.audioDuration,
    };
    message = 'STT metrics';
  } else {
    // Nothing to report for any other kind of metrics.
    return;
  }

  logger.child(fields).info(message);
};
68
+
69
/**
 * Type guard for {@link LLMMetrics}.
 *
 * Metrics count as LLM metrics when they carry a numeric `ttft`. The check is
 * on the type of the field rather than its truthiness, so a `ttft` of `0`
 * (a first token within the same millisecond) is still recognised.
 *
 * @param metrics - Metrics object to classify.
 * @returns `true` when `metrics` has a numeric `ttft` field.
 */
export const isLLMMetrics = (metrics: AgentMetrics): metrics is LLMMetrics => {
  return typeof (metrics as LLMMetrics).ttft === 'number';
};
72
+
73
/**
 * Type guard for {@link PipelineLLMMetrics}: LLM metrics that also carry a
 * non-empty `sequenceId`.
 */
export const isPipelineLLMMetrics = (metrics: AgentMetrics): metrics is PipelineLLMMetrics => {
  if (!isLLMMetrics(metrics)) {
    return false;
  }
  return Boolean((metrics as PipelineLLMMetrics).sequenceId);
};
76
+
77
/**
 * Type guard for {@link VADMetrics}.
 *
 * Metrics count as VAD metrics when they carry a numeric `inferenceCount`.
 * The check is on the type of the field rather than its truthiness, so a
 * count of `0` is still recognised.
 *
 * @param metrics - Metrics object to classify.
 * @returns `true` when `metrics` has a numeric `inferenceCount` field.
 */
export const isVADMetrics = (metrics: AgentMetrics): metrics is VADMetrics => {
  return typeof (metrics as VADMetrics).inferenceCount === 'number';
};
80
+
81
/**
 * Type guard for {@link PipelineEOUMetrics}.
 *
 * Metrics count as end-of-utterance metrics when they carry a numeric
 * `endOfUtteranceDelay`. The check is on the type of the field rather than
 * its truthiness, so a delay of `0` is still recognised.
 *
 * @param metrics - Metrics object to classify.
 * @returns `true` when `metrics` has a numeric `endOfUtteranceDelay` field.
 */
export const isPipelineEOUMetrics = (metrics: AgentMetrics): metrics is PipelineEOUMetrics => {
  return typeof (metrics as PipelineEOUMetrics).endOfUtteranceDelay === 'number';
};
84
+
85
/**
 * Type guard for {@link TTSMetrics}.
 *
 * Metrics count as TTS metrics when they carry a numeric `ttfb`. The check is
 * on the type of the field rather than its truthiness, so a `ttfb` of `0` is
 * still recognised.
 *
 * @param metrics - Metrics object to classify.
 * @returns `true` when `metrics` has a numeric `ttfb` field.
 */
export const isTTSMetrics = (metrics: AgentMetrics): metrics is TTSMetrics => {
  return typeof (metrics as TTSMetrics).ttfb === 'number';
};
88
+
89
/**
 * Type guard for {@link PipelineTTSMetrics}: TTS metrics that also carry a
 * non-empty `sequenceId`.
 */
export const isPipelineTTSMetrics = (metrics: AgentMetrics): metrics is PipelineTTSMetrics => {
  if (!isTTSMetrics(metrics)) {
    return false;
  }
  return Boolean((metrics as PipelineTTSMetrics).sequenceId);
};
92
+
93
/**
 * Type guard for {@link STTMetrics}.
 *
 * STT metrics are identified by exclusion: anything that is not recognised as
 * LLM, VAD, end-of-utterance, or TTS metrics is treated as STT metrics.
 */
export const isSTTMetrics = (metrics: AgentMetrics): metrics is STTMetrics => {
  return (
    !isLLMMetrics(metrics) &&
    !isVADMetrics(metrics) &&
    !isPipelineEOUMetrics(metrics) &&
    !isTTSMetrics(metrics)
  );
};
@@ -21,6 +21,7 @@ import { EventEmitter } from 'node:events';
21
21
  import { AudioByteStream } from '../audio.js';
22
22
  import * as llm from '../llm/index.js';
23
23
  import { log } from '../log.js';
24
+ import type { MultimodalLLMMetrics } from '../metrics/base.js';
24
25
  import { BasicTranscriptionForwarder } from '../transcription.js';
25
26
  import { findMicroTrackId } from '../utils.js';
26
27
  import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
@@ -35,6 +36,7 @@ export abstract class RealtimeSession extends EventEmitter {
35
36
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
36
37
  abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer
37
38
  abstract fncCtx: llm.FunctionContext | undefined;
39
+ abstract recoverFromTextResponse(itemId: string): void;
38
40
  }
39
41
 
40
42
  /**
@@ -60,21 +62,27 @@ export class MultimodalAgent extends EventEmitter {
60
62
  room: Room | null = null;
61
63
  linkedParticipant: RemoteParticipant | null = null;
62
64
  subscribedTrack: RemoteAudioTrack | null = null;
63
- readMicroTask: { promise: Promise<void>; cancel: () => void } | null = null;
65
+ readMicroTask: Promise<void> | null = null;
66
+
67
+ #textResponseRetries = 0;
68
+ #maxTextResponseRetries: number;
64
69
 
65
70
  constructor({
66
71
  model,
67
72
  chatCtx,
68
73
  fncCtx,
74
+ maxTextResponseRetries = 5,
69
75
  }: {
70
76
  model: RealtimeModel;
71
77
  chatCtx?: llm.ChatContext;
72
78
  fncCtx?: llm.FunctionContext;
79
+ maxTextResponseRetries?: number;
73
80
  }) {
74
81
  super();
75
82
  this.model = model;
76
83
  this.#chatCtx = chatCtx;
77
84
  this.#fncCtx = fncCtx;
85
+ this.#maxTextResponseRetries = maxTextResponseRetries;
78
86
  }
79
87
 
80
88
  #participant: RemoteParticipant | string | null = null;
@@ -145,7 +153,7 @@ export class MultimodalAgent extends EventEmitter {
145
153
  if (this.linkedParticipant) {
146
154
  return;
147
155
  }
148
- this.#linkParticipant(participant.identity);
156
+ this.#linkParticipant(participant.identity!);
149
157
  });
150
158
  room.on(
151
159
  RoomEvent.TrackPublished,
@@ -219,12 +227,12 @@ export class MultimodalAgent extends EventEmitter {
219
227
  if (typeof participant === 'string') {
220
228
  this.#linkParticipant(participant);
221
229
  } else {
222
- this.#linkParticipant(participant.identity);
230
+ this.#linkParticipant(participant.identity!);
223
231
  }
224
232
  } else {
225
233
  // No participant specified, try to find the first participant in the room
226
234
  for (const participant of room.remoteParticipants.values()) {
227
- this.#linkParticipant(participant.identity);
235
+ this.#linkParticipant(participant.identity!);
228
236
  break;
229
237
  }
230
238
  }
@@ -235,9 +243,11 @@ export class MultimodalAgent extends EventEmitter {
235
243
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
236
244
  this.#session.on('response_content_added', (message: any) => {
237
245
  // openai.realtime.RealtimeContent
246
+ if (message.contentType === 'text') return;
247
+
238
248
  const trFwd = new BasicTranscriptionForwarder(
239
249
  this.room!,
240
- this.room!.localParticipant!.identity,
250
+ this.room!.localParticipant!.identity!,
241
251
  this.#getLocalTrackSid()!,
242
252
  message.responseId,
243
253
  );
@@ -252,6 +262,36 @@ export class MultimodalAgent extends EventEmitter {
252
262
  this.#playingHandle = handle;
253
263
  });
254
264
 
265
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
266
+ this.#session.on('response_content_done', (message: any) => {
267
+ // openai.realtime.RealtimeContent
268
+ if (message.contentType === 'text') {
269
+ if (this.#textResponseRetries >= this.#maxTextResponseRetries) {
270
+ throw new Error(
271
+ 'The OpenAI Realtime API returned a text response ' +
272
+ `after ${this.#maxTextResponseRetries} retries. ` +
273
+ 'Please try to reduce the number of text system or ' +
274
+ 'assistant messages in the chat context.',
275
+ );
276
+ }
277
+
278
+ this.#textResponseRetries++;
279
+ this.#logger
280
+ .child({
281
+ itemId: message.itemId,
282
+ text: message.text,
283
+ retries: this.#textResponseRetries,
284
+ })
285
+ .warn(
286
+ 'The OpenAI Realtime API returned a text response instead of audio. ' +
287
+ 'Attempting to recover to audio mode...',
288
+ );
289
+ this.#session!.recoverFromTextResponse(message.itemId);
290
+ } else {
291
+ this.#textResponseRetries = 0;
292
+ }
293
+ });
294
+
255
295
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
256
296
  this.#session.on('input_speech_committed', (ev: any) => {
257
297
  // openai.realtime.InputSpeechCommittedEvent
@@ -284,6 +324,7 @@ export class MultimodalAgent extends EventEmitter {
284
324
  });
285
325
 
286
326
  this.#session.on('input_speech_started', (ev: any) => {
327
+ this.emit('user_started_speaking');
287
328
  if (this.#playingHandle && !this.#playingHandle.done) {
288
329
  this.#playingHandle.interrupt();
289
330
 
@@ -326,6 +367,10 @@ export class MultimodalAgent extends EventEmitter {
326
367
  this.#updateState();
327
368
  });
328
369
 
370
+ this.#session.on('metrics_collected', (metrics: MultimodalLLMMetrics) => {
371
+ this.emit('metrics_collected', metrics);
372
+ });
373
+
329
374
  resolve(this.#session);
330
375
  });
331
376
  }
@@ -404,27 +449,16 @@ export class MultimodalAgent extends EventEmitter {
404
449
  };
405
450
  this.subscribedTrack = track;
406
451
 
407
- if (this.readMicroTask) {
408
- this.readMicroTask.cancel();
409
- }
410
-
411
- let cancel: () => void;
412
- this.readMicroTask = {
413
- promise: new Promise<void>((resolve, reject) => {
414
- cancel = () => {
415
- reject(new Error('Task cancelled'));
416
- };
417
- readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
418
- .then(resolve)
419
- .catch(reject);
420
- }),
421
- cancel: () => cancel(),
422
- };
452
+ this.readMicroTask = new Promise<void>((resolve, reject) => {
453
+ readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
454
+ .then(resolve)
455
+ .catch(reject);
456
+ });
423
457
  }
424
458
 
425
459
  #getLocalTrackSid(): string | null {
426
460
  if (!this.#localTrackSid && this.room && this.room.localParticipant) {
427
- this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
461
+ this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);
428
462
  }
429
463
  return this.#localTrackSid;
430
464
  }
@@ -475,7 +509,7 @@ export class MultimodalAgent extends EventEmitter {
475
509
 
476
510
  #setState(state: AgentState) {
477
511
  if (this.room?.isConnected && this.room.localParticipant) {
478
- const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
512
+ const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];
479
513
  if (currentState !== state) {
480
514
  this.room.localParticipant.setAttributes({
481
515
  [AGENT_STATE_ATTRIBUTE]: state,
@@ -7,9 +7,9 @@ export {
7
7
  type BeforeTTSCallback,
8
8
  type BeforeLLMCallback,
9
9
  type VPACallbacks,
10
- type AgentCallContext,
11
10
  type AgentTranscriptionOptions,
12
11
  type VPAOptions,
13
12
  VPAEvent,
14
13
  VoicePipelineAgent,
14
+ AgentCallContext,
15
15
  } from './pipeline_agent.js';