@livekit/agents 0.5.2 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +34 -16
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +4 -5
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +34 -16
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +166 -66
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +169 -69
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +11 -4
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +11 -4
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/dist/worker.cjs +5 -2
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +5 -2
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/index.ts +2 -1
- package/src/job.ts +3 -3
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +57 -23
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +208 -89
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/utils.ts +12 -4
- package/src/vad.ts +61 -4
- package/src/worker.ts +7 -3
package/src/index.ts
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import * as cli from './cli.js';
|
|
13
13
|
import * as llm from './llm/index.js';
|
|
14
|
+
import * as metrics from './metrics/index.js';
|
|
14
15
|
import * as multimodal from './multimodal/index.js';
|
|
15
16
|
import * as pipeline from './pipeline/index.js';
|
|
16
17
|
import * as stt from './stt/index.js';
|
|
@@ -28,4 +29,4 @@ export * from './generator.js';
|
|
|
28
29
|
export * from './audio.js';
|
|
29
30
|
export * from './transcription.js';
|
|
30
31
|
|
|
31
|
-
export { cli, stt, tts, llm, pipeline, multimodal, tokenize };
|
|
32
|
+
export { cli, stt, tts, llm, pipeline, multimodal, tokenize, metrics };
|
package/src/job.ts
CHANGED
|
@@ -190,15 +190,15 @@ export class JobContext {
|
|
|
190
190
|
/** @internal */
|
|
191
191
|
onParticipantConnected(p: RemoteParticipant) {
|
|
192
192
|
for (const callback of this.#participantEntrypoints) {
|
|
193
|
-
if (this.#participantTasks[p.identity]?.callback == callback) {
|
|
193
|
+
if (this.#participantTasks[p.identity!]?.callback == callback) {
|
|
194
194
|
this.#logger.warn(
|
|
195
195
|
'a participant has joined before a prior prticipant task matching the same identity has finished:',
|
|
196
196
|
p.identity,
|
|
197
197
|
);
|
|
198
198
|
}
|
|
199
199
|
const result = callback(this, p);
|
|
200
|
-
result.finally(() => delete this.#participantTasks[p.identity]);
|
|
201
|
-
this.#participantTasks[p.identity] = { callback, result };
|
|
200
|
+
result.finally(() => delete this.#participantTasks[p.identity!]);
|
|
201
|
+
this.#participantTasks[p.identity!] = { callback, result };
|
|
202
202
|
}
|
|
203
203
|
}
|
|
204
204
|
|
package/src/llm/index.ts
CHANGED
package/src/llm/llm.ts
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
5
|
+
import { EventEmitter } from 'node:events';
|
|
6
|
+
import type { LLMMetrics } from '../metrics/base.js';
|
|
4
7
|
import { AsyncIterableQueue } from '../utils.js';
|
|
5
8
|
import type { ChatContext, ChatRole } from './chat_context.js';
|
|
6
9
|
import type { FunctionCallInfo, FunctionContext } from './function_context.js';
|
|
@@ -28,7 +31,15 @@ export interface ChatChunk {
|
|
|
28
31
|
usage?: CompletionUsage;
|
|
29
32
|
}
|
|
30
33
|
|
|
31
|
-
export
|
|
34
|
+
export enum LLMEvent {
|
|
35
|
+
METRICS_COLLECTED,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
export type LLMCallbacks = {
|
|
39
|
+
[LLMEvent.METRICS_COLLECTED]: (metrics: LLMMetrics) => void;
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCallbacks>) {
|
|
32
43
|
/**
|
|
33
44
|
* Returns a {@link LLMStream} that can be used to push text and receive LLM responses.
|
|
34
45
|
*/
|
|
@@ -48,16 +59,56 @@ export abstract class LLM {
|
|
|
48
59
|
}
|
|
49
60
|
|
|
50
61
|
export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
|
|
62
|
+
protected output = new AsyncIterableQueue<ChatChunk>();
|
|
51
63
|
protected queue = new AsyncIterableQueue<ChatChunk>();
|
|
52
64
|
protected closed = false;
|
|
53
65
|
protected _functionCalls: FunctionCallInfo[] = [];
|
|
66
|
+
abstract label: string;
|
|
54
67
|
|
|
68
|
+
#llm: LLM;
|
|
55
69
|
#chatCtx: ChatContext;
|
|
56
70
|
#fncCtx?: FunctionContext;
|
|
57
71
|
|
|
58
|
-
constructor(chatCtx: ChatContext, fncCtx?: FunctionContext) {
|
|
72
|
+
constructor(llm: LLM, chatCtx: ChatContext, fncCtx?: FunctionContext) {
|
|
73
|
+
this.#llm = llm;
|
|
59
74
|
this.#chatCtx = chatCtx;
|
|
60
75
|
this.#fncCtx = fncCtx;
|
|
76
|
+
this.monitorMetrics();
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
protected async monitorMetrics() {
|
|
80
|
+
const startTime = process.hrtime.bigint();
|
|
81
|
+
let ttft: bigint | undefined;
|
|
82
|
+
let requestId = '';
|
|
83
|
+
let usage: CompletionUsage | undefined;
|
|
84
|
+
|
|
85
|
+
for await (const ev of this.queue) {
|
|
86
|
+
this.output.put(ev);
|
|
87
|
+
requestId = ev.requestId;
|
|
88
|
+
if (!ttft) {
|
|
89
|
+
ttft = process.hrtime.bigint() - startTime;
|
|
90
|
+
}
|
|
91
|
+
if (ev.usage) {
|
|
92
|
+
usage = ev.usage;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
this.output.close();
|
|
96
|
+
|
|
97
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
98
|
+
const metrics: LLMMetrics = {
|
|
99
|
+
timestamp: Date.now(),
|
|
100
|
+
requestId,
|
|
101
|
+
ttft: Math.trunc(Number(ttft! / BigInt(1000000))),
|
|
102
|
+
duration: Math.trunc(Number(duration / BigInt(1000000))),
|
|
103
|
+
cancelled: false, // XXX(nbsp)
|
|
104
|
+
label: this.label,
|
|
105
|
+
completionTokens: usage?.completionTokens || 0,
|
|
106
|
+
promptTokens: usage?.promptTokens || 0,
|
|
107
|
+
totalTokens: usage?.totalTokens || 0,
|
|
108
|
+
tokensPerSecond:
|
|
109
|
+
(usage?.completionTokens || 0) / Math.trunc(Number(duration / BigInt(1000000000))),
|
|
110
|
+
};
|
|
111
|
+
this.#llm.emit(LLMEvent.METRICS_COLLECTED, metrics);
|
|
61
112
|
}
|
|
62
113
|
|
|
63
114
|
/** List of called functions from this stream. */
|
|
@@ -88,10 +139,11 @@ export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
|
|
|
88
139
|
}
|
|
89
140
|
|
|
90
141
|
next(): Promise<IteratorResult<ChatChunk>> {
|
|
91
|
-
return this.
|
|
142
|
+
return this.output.next();
|
|
92
143
|
}
|
|
93
144
|
|
|
94
145
|
close() {
|
|
146
|
+
this.output.close();
|
|
95
147
|
this.queue.close();
|
|
96
148
|
this.closed = true;
|
|
97
149
|
}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
export interface LLMMetrics {
|
|
6
|
+
requestId: string;
|
|
7
|
+
timestamp: number;
|
|
8
|
+
ttft: number;
|
|
9
|
+
duration: number;
|
|
10
|
+
label: string;
|
|
11
|
+
cancelled: boolean;
|
|
12
|
+
completionTokens: number;
|
|
13
|
+
promptTokens: number;
|
|
14
|
+
totalTokens: number;
|
|
15
|
+
tokensPerSecond: number;
|
|
16
|
+
error?: Error;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export interface STTMetrics {
|
|
20
|
+
requestId: string;
|
|
21
|
+
timestamp: number;
|
|
22
|
+
duration: number;
|
|
23
|
+
label: string;
|
|
24
|
+
audioDuration: number;
|
|
25
|
+
streamed: boolean;
|
|
26
|
+
error?: Error;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export interface TTSMetrics {
|
|
30
|
+
requestId: string;
|
|
31
|
+
timestamp: number;
|
|
32
|
+
ttfb: number;
|
|
33
|
+
duration: number;
|
|
34
|
+
label: string;
|
|
35
|
+
audioDuration: number;
|
|
36
|
+
cancelled: boolean;
|
|
37
|
+
charactersCount: number;
|
|
38
|
+
streamed: boolean;
|
|
39
|
+
error?: Error;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export interface VADMetrics {
|
|
43
|
+
timestamp: number;
|
|
44
|
+
idleTime: number;
|
|
45
|
+
inferenceDurationTotal: number;
|
|
46
|
+
inferenceCount: number;
|
|
47
|
+
label: string;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
export interface PipelineEOUMetrics {
|
|
51
|
+
/**
|
|
52
|
+
* Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
|
|
53
|
+
*/
|
|
54
|
+
sequenceId: string;
|
|
55
|
+
/** Timestamp of when the event was recorded */
|
|
56
|
+
timestamp: number;
|
|
57
|
+
/** Amount of time between the end of speech from VAD and the decision to end the user's turn */
|
|
58
|
+
endOfUtteranceDelay: number;
|
|
59
|
+
/**
|
|
60
|
+
* Time taken to obtain the transcript after the end of the user's speech.
|
|
61
|
+
*
|
|
62
|
+
* @remarks
|
|
63
|
+
* May be 0 if the transcript was already available.
|
|
64
|
+
*/
|
|
65
|
+
transcriptionDelay: number;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
export interface PipelineLLMMetrics extends LLMMetrics {
|
|
69
|
+
/**
|
|
70
|
+
* Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
|
|
71
|
+
*/
|
|
72
|
+
sequenceId: string;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
export interface PipelineTTSMetrics extends TTSMetrics {
|
|
76
|
+
/**
|
|
77
|
+
* Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
|
|
78
|
+
*/
|
|
79
|
+
sequenceId: string;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
export type PipelineSTTMetrics = STTMetrics;
|
|
83
|
+
export type PipelineVADMetrics = VADMetrics;
|
|
84
|
+
|
|
85
|
+
export class MultimodalLLMError extends Error {
|
|
86
|
+
type?: string;
|
|
87
|
+
reason?: string;
|
|
88
|
+
code?: string;
|
|
89
|
+
constructor(
|
|
90
|
+
{
|
|
91
|
+
type,
|
|
92
|
+
reason,
|
|
93
|
+
code,
|
|
94
|
+
message,
|
|
95
|
+
}: { type?: string; reason?: string; code?: string; message?: string } = {},
|
|
96
|
+
options?: ErrorOptions,
|
|
97
|
+
) {
|
|
98
|
+
super(message, options);
|
|
99
|
+
this.type = type;
|
|
100
|
+
this.reason = reason;
|
|
101
|
+
this.code = code;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
export interface MultimodalLLMMetrics extends LLMMetrics {
|
|
106
|
+
inputTokenDetails: {
|
|
107
|
+
cachedTokens: number;
|
|
108
|
+
textTokens: number;
|
|
109
|
+
audioTokens: number;
|
|
110
|
+
};
|
|
111
|
+
outputTokenDetails: {
|
|
112
|
+
textTokens: number;
|
|
113
|
+
audioTokens: number;
|
|
114
|
+
};
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
export type AgentMetrics =
|
|
118
|
+
| STTMetrics
|
|
119
|
+
| LLMMetrics
|
|
120
|
+
| TTSMetrics
|
|
121
|
+
| VADMetrics
|
|
122
|
+
| PipelineSTTMetrics
|
|
123
|
+
| PipelineEOUMetrics
|
|
124
|
+
| PipelineLLMMetrics
|
|
125
|
+
| PipelineTTSMetrics
|
|
126
|
+
| PipelineVADMetrics
|
|
127
|
+
| MultimodalLLMMetrics;
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
export type {
|
|
6
|
+
AgentMetrics,
|
|
7
|
+
STTMetrics,
|
|
8
|
+
LLMMetrics,
|
|
9
|
+
TTSMetrics,
|
|
10
|
+
VADMetrics,
|
|
11
|
+
PipelineSTTMetrics,
|
|
12
|
+
PipelineEOUMetrics,
|
|
13
|
+
PipelineLLMMetrics,
|
|
14
|
+
PipelineTTSMetrics,
|
|
15
|
+
PipelineVADMetrics,
|
|
16
|
+
MultimodalLLMMetrics,
|
|
17
|
+
} from './base.js';
|
|
18
|
+
export { MultimodalLLMError } from './base.js';
|
|
19
|
+
export { type UsageSummary, UsageCollector } from './usage_collector.js';
|
|
20
|
+
export { logMetrics } from './utils.js';
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { AgentMetrics } from './base.js';
|
|
5
|
+
import { isLLMMetrics, isSTTMetrics, isTTSMetrics } from './utils.js';
|
|
6
|
+
|
|
7
|
+
export interface UsageSummary {
|
|
8
|
+
llmPromptTokens: number;
|
|
9
|
+
llmCompletionTokens: number;
|
|
10
|
+
ttsCharactersCount: number;
|
|
11
|
+
sttAudioDuration: number;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
export class UsageCollector {
|
|
15
|
+
#summary: UsageSummary;
|
|
16
|
+
|
|
17
|
+
constructor() {
|
|
18
|
+
this.#summary = {
|
|
19
|
+
llmPromptTokens: 0,
|
|
20
|
+
llmCompletionTokens: 0,
|
|
21
|
+
ttsCharactersCount: 0,
|
|
22
|
+
sttAudioDuration: 0,
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
collect(metrics: AgentMetrics) {
|
|
27
|
+
if (isLLMMetrics(metrics)) {
|
|
28
|
+
this.#summary.llmPromptTokens += metrics.promptTokens;
|
|
29
|
+
this.#summary.llmCompletionTokens += metrics.completionTokens;
|
|
30
|
+
} else if (isTTSMetrics(metrics)) {
|
|
31
|
+
this.#summary.ttsCharactersCount += metrics.charactersCount;
|
|
32
|
+
} else if (isSTTMetrics(metrics)) {
|
|
33
|
+
this.#summary.sttAudioDuration += metrics.audioDuration;
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
get summary(): UsageSummary {
|
|
38
|
+
return { ...this.#summary };
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { log } from '../log.js';
|
|
5
|
+
import type {
|
|
6
|
+
AgentMetrics,
|
|
7
|
+
LLMMetrics,
|
|
8
|
+
PipelineEOUMetrics,
|
|
9
|
+
PipelineLLMMetrics,
|
|
10
|
+
PipelineTTSMetrics,
|
|
11
|
+
STTMetrics,
|
|
12
|
+
TTSMetrics,
|
|
13
|
+
VADMetrics,
|
|
14
|
+
} from './base.js';
|
|
15
|
+
|
|
16
|
+
export const logMetrics = (metrics: AgentMetrics) => {
|
|
17
|
+
const logger = log();
|
|
18
|
+
if (isPipelineLLMMetrics(metrics)) {
|
|
19
|
+
logger
|
|
20
|
+
.child({
|
|
21
|
+
sequenceId: metrics.sequenceId,
|
|
22
|
+
ttft: metrics.ttft,
|
|
23
|
+
inputTokens: metrics.promptTokens,
|
|
24
|
+
outputTokens: metrics.completionTokens,
|
|
25
|
+
tokensPerSecond: metrics.tokensPerSecond,
|
|
26
|
+
})
|
|
27
|
+
.info('Pipeline LLM metrics');
|
|
28
|
+
} else if (isLLMMetrics(metrics)) {
|
|
29
|
+
logger
|
|
30
|
+
.child({
|
|
31
|
+
ttft: metrics.ttft,
|
|
32
|
+
inputTokens: metrics.promptTokens,
|
|
33
|
+
outputTokens: metrics.completionTokens,
|
|
34
|
+
tokensPerSecond: metrics.tokensPerSecond,
|
|
35
|
+
})
|
|
36
|
+
.info('LLM metrics');
|
|
37
|
+
} else if (isPipelineTTSMetrics(metrics)) {
|
|
38
|
+
logger
|
|
39
|
+
.child({
|
|
40
|
+
sequenceId: metrics.sequenceId,
|
|
41
|
+
ttfb: metrics.ttfb,
|
|
42
|
+
audioDuration: metrics.audioDuration,
|
|
43
|
+
})
|
|
44
|
+
.info('Pipeline TTS metrics');
|
|
45
|
+
} else if (isTTSMetrics(metrics)) {
|
|
46
|
+
logger
|
|
47
|
+
.child({
|
|
48
|
+
ttfb: metrics.ttfb,
|
|
49
|
+
audioDuration: metrics.audioDuration,
|
|
50
|
+
})
|
|
51
|
+
.info('TTS metrics');
|
|
52
|
+
} else if (isPipelineEOUMetrics(metrics)) {
|
|
53
|
+
logger
|
|
54
|
+
.child({
|
|
55
|
+
sequenceId: metrics.sequenceId,
|
|
56
|
+
endOfUtteranceDelay: metrics.endOfUtteranceDelay,
|
|
57
|
+
transcriptionDelay: metrics.transcriptionDelay,
|
|
58
|
+
})
|
|
59
|
+
.info('Pipeline EOU metrics');
|
|
60
|
+
} else if (isSTTMetrics(metrics)) {
|
|
61
|
+
logger
|
|
62
|
+
.child({
|
|
63
|
+
audioDuration: metrics.audioDuration,
|
|
64
|
+
})
|
|
65
|
+
.info('STT metrics');
|
|
66
|
+
}
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
export const isLLMMetrics = (metrics: AgentMetrics): metrics is LLMMetrics => {
|
|
70
|
+
return !!(metrics as LLMMetrics).ttft;
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
export const isPipelineLLMMetrics = (metrics: AgentMetrics): metrics is PipelineLLMMetrics => {
|
|
74
|
+
return isLLMMetrics(metrics) && !!(metrics as PipelineLLMMetrics).sequenceId;
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
export const isVADMetrics = (metrics: AgentMetrics): metrics is VADMetrics => {
|
|
78
|
+
return !!(metrics as VADMetrics).inferenceCount;
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
export const isPipelineEOUMetrics = (metrics: AgentMetrics): metrics is PipelineEOUMetrics => {
|
|
82
|
+
return !!(metrics as PipelineEOUMetrics).endOfUtteranceDelay;
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
export const isTTSMetrics = (metrics: AgentMetrics): metrics is TTSMetrics => {
|
|
86
|
+
return !!(metrics as TTSMetrics).ttfb;
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
export const isPipelineTTSMetrics = (metrics: AgentMetrics): metrics is PipelineTTSMetrics => {
|
|
90
|
+
return isTTSMetrics(metrics) && !!(metrics as PipelineTTSMetrics).sequenceId;
|
|
91
|
+
};
|
|
92
|
+
|
|
93
|
+
export const isSTTMetrics = (metrics: AgentMetrics): metrics is STTMetrics => {
|
|
94
|
+
return !(
|
|
95
|
+
isLLMMetrics(metrics) ||
|
|
96
|
+
isVADMetrics(metrics) ||
|
|
97
|
+
isPipelineEOUMetrics(metrics) ||
|
|
98
|
+
isTTSMetrics(metrics)
|
|
99
|
+
);
|
|
100
|
+
};
|
|
@@ -21,6 +21,7 @@ import { EventEmitter } from 'node:events';
|
|
|
21
21
|
import { AudioByteStream } from '../audio.js';
|
|
22
22
|
import * as llm from '../llm/index.js';
|
|
23
23
|
import { log } from '../log.js';
|
|
24
|
+
import type { MultimodalLLMMetrics } from '../metrics/base.js';
|
|
24
25
|
import { BasicTranscriptionForwarder } from '../transcription.js';
|
|
25
26
|
import { findMicroTrackId } from '../utils.js';
|
|
26
27
|
import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
|
|
@@ -35,6 +36,7 @@ export abstract class RealtimeSession extends EventEmitter {
|
|
|
35
36
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
36
37
|
abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer
|
|
37
38
|
abstract fncCtx: llm.FunctionContext | undefined;
|
|
39
|
+
abstract recoverFromTextResponse(itemId: string): void;
|
|
38
40
|
}
|
|
39
41
|
|
|
40
42
|
/**
|
|
@@ -60,21 +62,27 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
60
62
|
room: Room | null = null;
|
|
61
63
|
linkedParticipant: RemoteParticipant | null = null;
|
|
62
64
|
subscribedTrack: RemoteAudioTrack | null = null;
|
|
63
|
-
readMicroTask:
|
|
65
|
+
readMicroTask: Promise<void> | null = null;
|
|
66
|
+
|
|
67
|
+
#textResponseRetries = 0;
|
|
68
|
+
#maxTextResponseRetries: number;
|
|
64
69
|
|
|
65
70
|
constructor({
|
|
66
71
|
model,
|
|
67
72
|
chatCtx,
|
|
68
73
|
fncCtx,
|
|
74
|
+
maxTextResponseRetries = 5,
|
|
69
75
|
}: {
|
|
70
76
|
model: RealtimeModel;
|
|
71
77
|
chatCtx?: llm.ChatContext;
|
|
72
78
|
fncCtx?: llm.FunctionContext;
|
|
79
|
+
maxTextResponseRetries?: number;
|
|
73
80
|
}) {
|
|
74
81
|
super();
|
|
75
82
|
this.model = model;
|
|
76
83
|
this.#chatCtx = chatCtx;
|
|
77
84
|
this.#fncCtx = fncCtx;
|
|
85
|
+
this.#maxTextResponseRetries = maxTextResponseRetries;
|
|
78
86
|
}
|
|
79
87
|
|
|
80
88
|
#participant: RemoteParticipant | string | null = null;
|
|
@@ -145,7 +153,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
145
153
|
if (this.linkedParticipant) {
|
|
146
154
|
return;
|
|
147
155
|
}
|
|
148
|
-
this.#linkParticipant(participant.identity);
|
|
156
|
+
this.#linkParticipant(participant.identity!);
|
|
149
157
|
});
|
|
150
158
|
room.on(
|
|
151
159
|
RoomEvent.TrackPublished,
|
|
@@ -219,12 +227,12 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
219
227
|
if (typeof participant === 'string') {
|
|
220
228
|
this.#linkParticipant(participant);
|
|
221
229
|
} else {
|
|
222
|
-
this.#linkParticipant(participant.identity);
|
|
230
|
+
this.#linkParticipant(participant.identity!);
|
|
223
231
|
}
|
|
224
232
|
} else {
|
|
225
233
|
// No participant specified, try to find the first participant in the room
|
|
226
234
|
for (const participant of room.remoteParticipants.values()) {
|
|
227
|
-
this.#linkParticipant(participant.identity);
|
|
235
|
+
this.#linkParticipant(participant.identity!);
|
|
228
236
|
break;
|
|
229
237
|
}
|
|
230
238
|
}
|
|
@@ -235,9 +243,11 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
235
243
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
236
244
|
this.#session.on('response_content_added', (message: any) => {
|
|
237
245
|
// openai.realtime.RealtimeContent
|
|
246
|
+
if (message.contentType === 'text') return;
|
|
247
|
+
|
|
238
248
|
const trFwd = new BasicTranscriptionForwarder(
|
|
239
249
|
this.room!,
|
|
240
|
-
this.room!.localParticipant!.identity
|
|
250
|
+
this.room!.localParticipant!.identity!,
|
|
241
251
|
this.#getLocalTrackSid()!,
|
|
242
252
|
message.responseId,
|
|
243
253
|
);
|
|
@@ -252,6 +262,36 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
252
262
|
this.#playingHandle = handle;
|
|
253
263
|
});
|
|
254
264
|
|
|
265
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
266
|
+
this.#session.on('response_content_done', (message: any) => {
|
|
267
|
+
// openai.realtime.RealtimeContent
|
|
268
|
+
if (message.contentType === 'text') {
|
|
269
|
+
if (this.#textResponseRetries >= this.#maxTextResponseRetries) {
|
|
270
|
+
throw new Error(
|
|
271
|
+
'The OpenAI Realtime API returned a text response ' +
|
|
272
|
+
`after ${this.#maxTextResponseRetries} retries. ` +
|
|
273
|
+
'Please try to reduce the number of text system or ' +
|
|
274
|
+
'assistant messages in the chat context.',
|
|
275
|
+
);
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
this.#textResponseRetries++;
|
|
279
|
+
this.#logger
|
|
280
|
+
.child({
|
|
281
|
+
itemId: message.itemId,
|
|
282
|
+
text: message.text,
|
|
283
|
+
retries: this.#textResponseRetries,
|
|
284
|
+
})
|
|
285
|
+
.warn(
|
|
286
|
+
'The OpenAI Realtime API returned a text response instead of audio. ' +
|
|
287
|
+
'Attempting to recover to audio mode...',
|
|
288
|
+
);
|
|
289
|
+
this.#session!.recoverFromTextResponse(message.itemId);
|
|
290
|
+
} else {
|
|
291
|
+
this.#textResponseRetries = 0;
|
|
292
|
+
}
|
|
293
|
+
});
|
|
294
|
+
|
|
255
295
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
256
296
|
this.#session.on('input_speech_committed', (ev: any) => {
|
|
257
297
|
// openai.realtime.InputSpeechCommittedEvent
|
|
@@ -284,6 +324,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
284
324
|
});
|
|
285
325
|
|
|
286
326
|
this.#session.on('input_speech_started', (ev: any) => {
|
|
327
|
+
this.emit('user_started_speaking');
|
|
287
328
|
if (this.#playingHandle && !this.#playingHandle.done) {
|
|
288
329
|
this.#playingHandle.interrupt();
|
|
289
330
|
|
|
@@ -326,6 +367,10 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
326
367
|
this.#updateState();
|
|
327
368
|
});
|
|
328
369
|
|
|
370
|
+
this.#session.on('metrics_collected', (metrics: MultimodalLLMMetrics) => {
|
|
371
|
+
this.emit('metrics_collected', metrics);
|
|
372
|
+
});
|
|
373
|
+
|
|
329
374
|
resolve(this.#session);
|
|
330
375
|
});
|
|
331
376
|
}
|
|
@@ -404,27 +449,16 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
404
449
|
};
|
|
405
450
|
this.subscribedTrack = track;
|
|
406
451
|
|
|
407
|
-
|
|
408
|
-
this.
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
this.readMicroTask = {
|
|
413
|
-
promise: new Promise<void>((resolve, reject) => {
|
|
414
|
-
cancel = () => {
|
|
415
|
-
reject(new Error('Task cancelled'));
|
|
416
|
-
};
|
|
417
|
-
readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
|
|
418
|
-
.then(resolve)
|
|
419
|
-
.catch(reject);
|
|
420
|
-
}),
|
|
421
|
-
cancel: () => cancel(),
|
|
422
|
-
};
|
|
452
|
+
this.readMicroTask = new Promise<void>((resolve, reject) => {
|
|
453
|
+
readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
|
|
454
|
+
.then(resolve)
|
|
455
|
+
.catch(reject);
|
|
456
|
+
});
|
|
423
457
|
}
|
|
424
458
|
|
|
425
459
|
#getLocalTrackSid(): string | null {
|
|
426
460
|
if (!this.#localTrackSid && this.room && this.room.localParticipant) {
|
|
427
|
-
this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant
|
|
461
|
+
this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);
|
|
428
462
|
}
|
|
429
463
|
return this.#localTrackSid;
|
|
430
464
|
}
|
|
@@ -475,7 +509,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
475
509
|
|
|
476
510
|
#setState(state: AgentState) {
|
|
477
511
|
if (this.room?.isConnected && this.room.localParticipant) {
|
|
478
|
-
const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
|
|
512
|
+
const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];
|
|
479
513
|
if (currentState !== state) {
|
|
480
514
|
this.room.localParticipant.setAttributes({
|
|
481
515
|
[AGENT_STATE_ATTRIBUTE]: state,
|
package/src/pipeline/index.ts
CHANGED