@livekit/agents 1.0.24 → 1.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +1 -2
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +1 -2
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +4 -4
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +0 -1
- package/dist/inference/tts.d.ts +0 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +4 -4
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +29 -2
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +6 -0
- package/dist/job.d.ts +6 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +19 -2
- package/dist/job.js.map +1 -1
- package/dist/llm/llm.cjs +2 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +2 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +12 -4
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +12 -4
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -8
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -8
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +8 -3
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +9 -3
- package/dist/stt/stt.d.ts +9 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +9 -4
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +23 -2
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +23 -2
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +10 -7
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +10 -7
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +27 -16
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +12 -5
- package/dist/tts/tts.d.ts +12 -5
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +28 -17
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +21 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +41 -10
- package/dist/types.d.ts +41 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +18 -30
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +54 -19
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +54 -19
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +0 -3
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +0 -3
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +107 -27
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +16 -2
- package/dist/voice/agent_session.d.ts +16 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +110 -27
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +4 -4
- package/dist/voice/events.d.ts +4 -4
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +6 -7
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +7 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +16 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +8 -0
- package/dist/voice/io.d.ts +8 -0
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +16 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +542 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +508 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/report.cjs +7 -2
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +11 -1
- package/dist/voice/report.d.ts +11 -1
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +7 -2
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +2 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +2 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +8 -7
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +2 -1
- package/dist/voice/room_io/_output.d.ts +2 -1
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +8 -7
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/worker.cjs +4 -3
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.js +4 -3
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/llm.ts +0 -1
- package/src/inference/stt.ts +1 -2
- package/src/inference/tts.ts +5 -4
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/job.ts +21 -2
- package/src/llm/llm.ts +2 -2
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +22 -5
- package/src/stt/stream_adapter.ts +18 -8
- package/src/stt/stt.ts +19 -6
- package/src/telemetry/traces.ts +25 -3
- package/src/tts/stream_adapter.ts +15 -7
- package/src/tts/tts.ts +46 -21
- package/src/types.ts +57 -33
- package/src/voice/agent.ts +59 -19
- package/src/voice/agent_activity.ts +0 -3
- package/src/voice/agent_session.ts +142 -35
- package/src/voice/events.ts +6 -3
- package/src/voice/generation.ts +10 -8
- package/src/voice/io.ts +19 -0
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +690 -0
- package/src/voice/report.ts +20 -3
- package/src/voice/room_io/_input.ts +2 -1
- package/src/voice/room_io/_output.ts +10 -7
- package/src/worker.ts +1 -1
package/src/voice/agent.ts
CHANGED
|
@@ -260,27 +260,41 @@ export class Agent<UserData = any> {
|
|
|
260
260
|
let wrapped_stt = activity.stt;
|
|
261
261
|
|
|
262
262
|
if (!wrapped_stt.capabilities.streaming) {
|
|
263
|
-
|
|
263
|
+
const vad = agent.vad || activity.vad;
|
|
264
|
+
if (!vad) {
|
|
264
265
|
throw new Error(
|
|
265
266
|
'STT does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming',
|
|
266
267
|
);
|
|
267
268
|
}
|
|
268
|
-
wrapped_stt = new STTStreamAdapter(wrapped_stt,
|
|
269
|
+
wrapped_stt = new STTStreamAdapter(wrapped_stt, vad);
|
|
269
270
|
}
|
|
270
271
|
|
|
271
|
-
const
|
|
272
|
+
const connOptions = activity.agentSession.connOptions.sttConnOptions;
|
|
273
|
+
const stream = wrapped_stt.stream({ connOptions });
|
|
272
274
|
stream.updateInputStream(audio);
|
|
273
275
|
|
|
276
|
+
let cleaned = false;
|
|
277
|
+
const cleanup = () => {
|
|
278
|
+
if (cleaned) return;
|
|
279
|
+
cleaned = true;
|
|
280
|
+
stream.detachInputStream();
|
|
281
|
+
stream.close();
|
|
282
|
+
};
|
|
283
|
+
|
|
274
284
|
return new ReadableStream({
|
|
275
285
|
async start(controller) {
|
|
276
|
-
|
|
277
|
-
|
|
286
|
+
try {
|
|
287
|
+
for await (const event of stream) {
|
|
288
|
+
controller.enqueue(event);
|
|
289
|
+
}
|
|
290
|
+
controller.close();
|
|
291
|
+
} finally {
|
|
292
|
+
// Always clean up the STT stream, whether it ends naturally or is cancelled
|
|
293
|
+
cleanup();
|
|
278
294
|
}
|
|
279
|
-
controller.close();
|
|
280
295
|
},
|
|
281
296
|
cancel() {
|
|
282
|
-
|
|
283
|
-
stream.close();
|
|
297
|
+
cleanup();
|
|
284
298
|
},
|
|
285
299
|
});
|
|
286
300
|
},
|
|
@@ -304,22 +318,36 @@ export class Agent<UserData = any> {
|
|
|
304
318
|
|
|
305
319
|
// TODO(brian): make parallelToolCalls configurable
|
|
306
320
|
const { toolChoice } = modelSettings;
|
|
321
|
+
const connOptions = activity.agentSession.connOptions.llmConnOptions;
|
|
307
322
|
|
|
308
323
|
const stream = activity.llm.chat({
|
|
309
324
|
chatCtx,
|
|
310
325
|
toolCtx,
|
|
311
326
|
toolChoice,
|
|
327
|
+
connOptions,
|
|
312
328
|
parallelToolCalls: true,
|
|
313
329
|
});
|
|
330
|
+
|
|
331
|
+
let cleaned = false;
|
|
332
|
+
const cleanup = () => {
|
|
333
|
+
if (cleaned) return;
|
|
334
|
+
cleaned = true;
|
|
335
|
+
stream.close();
|
|
336
|
+
};
|
|
337
|
+
|
|
314
338
|
return new ReadableStream({
|
|
315
339
|
async start(controller) {
|
|
316
|
-
|
|
317
|
-
|
|
340
|
+
try {
|
|
341
|
+
for await (const chunk of stream) {
|
|
342
|
+
controller.enqueue(chunk);
|
|
343
|
+
}
|
|
344
|
+
controller.close();
|
|
345
|
+
} finally {
|
|
346
|
+
cleanup();
|
|
318
347
|
}
|
|
319
|
-
controller.close();
|
|
320
348
|
},
|
|
321
349
|
cancel() {
|
|
322
|
-
|
|
350
|
+
cleanup();
|
|
323
351
|
},
|
|
324
352
|
});
|
|
325
353
|
},
|
|
@@ -340,21 +368,33 @@ export class Agent<UserData = any> {
|
|
|
340
368
|
wrapped_tts = new TTSStreamAdapter(wrapped_tts, new BasicSentenceTokenizer());
|
|
341
369
|
}
|
|
342
370
|
|
|
343
|
-
const
|
|
371
|
+
const connOptions = activity.agentSession.connOptions.ttsConnOptions;
|
|
372
|
+
const stream = wrapped_tts.stream({ connOptions });
|
|
344
373
|
stream.updateInputStream(text);
|
|
345
374
|
|
|
375
|
+
let cleaned = false;
|
|
376
|
+
const cleanup = () => {
|
|
377
|
+
if (cleaned) return;
|
|
378
|
+
cleaned = true;
|
|
379
|
+
stream.close();
|
|
380
|
+
};
|
|
381
|
+
|
|
346
382
|
return new ReadableStream({
|
|
347
383
|
async start(controller) {
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
384
|
+
try {
|
|
385
|
+
for await (const chunk of stream) {
|
|
386
|
+
if (chunk === SynthesizeStream.END_OF_STREAM) {
|
|
387
|
+
break;
|
|
388
|
+
}
|
|
389
|
+
controller.enqueue(chunk.frame);
|
|
351
390
|
}
|
|
352
|
-
controller.
|
|
391
|
+
controller.close();
|
|
392
|
+
} finally {
|
|
393
|
+
cleanup();
|
|
353
394
|
}
|
|
354
|
-
controller.close();
|
|
355
395
|
},
|
|
356
396
|
cancel() {
|
|
357
|
-
|
|
397
|
+
cleanup();
|
|
358
398
|
},
|
|
359
399
|
});
|
|
360
400
|
},
|
|
package/src/voice/agent_activity.ts
CHANGED
|
@@ -2259,15 +2259,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2259
2259
|
}
|
|
2260
2260
|
if (this.stt instanceof STT) {
|
|
2261
2261
|
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
2262
|
-
await this.stt.close();
|
|
2263
2262
|
}
|
|
2264
2263
|
if (this.tts instanceof TTS) {
|
|
2265
2264
|
this.tts.off('metrics_collected', this.onMetricsCollected);
|
|
2266
|
-
await this.tts.close();
|
|
2267
2265
|
}
|
|
2268
2266
|
if (this.vad instanceof VAD) {
|
|
2269
2267
|
this.vad.off('metrics_collected', this.onMetricsCollected);
|
|
2270
|
-
await this.vad.close();
|
|
2271
2268
|
}
|
|
2272
2269
|
|
|
2273
2270
|
this.detachAudioInput();
|
|
package/src/voice/agent_session.ts
CHANGED
|
@@ -15,7 +15,7 @@ import {
|
|
|
15
15
|
type STTModelString,
|
|
16
16
|
type TTSModelString,
|
|
17
17
|
} from '../inference/index.js';
|
|
18
|
-
import { getJobContext } from '../job.js';
|
|
18
|
+
import { type JobContext, getJobContext } from '../job.js';
|
|
19
19
|
import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
|
|
20
20
|
import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
|
|
21
21
|
import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
|
|
@@ -25,6 +25,12 @@ import type { STT } from '../stt/index.js';
|
|
|
25
25
|
import type { STTError } from '../stt/stt.js';
|
|
26
26
|
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
27
27
|
import type { TTS, TTSError } from '../tts/tts.js';
|
|
28
|
+
import {
|
|
29
|
+
DEFAULT_API_CONNECT_OPTIONS,
|
|
30
|
+
DEFAULT_SESSION_CONNECT_OPTIONS,
|
|
31
|
+
type ResolvedSessionConnectOptions,
|
|
32
|
+
type SessionConnectOptions,
|
|
33
|
+
} from '../types.js';
|
|
28
34
|
import type { VAD } from '../vad.js';
|
|
29
35
|
import type { Agent } from './agent.js';
|
|
30
36
|
import { AgentActivity } from './agent_activity.js';
|
|
@@ -40,6 +46,7 @@ import {
|
|
|
40
46
|
type ErrorEvent,
|
|
41
47
|
type FunctionToolsExecutedEvent,
|
|
42
48
|
type MetricsCollectedEvent,
|
|
49
|
+
type ShutdownReason,
|
|
43
50
|
type SpeechCreatedEvent,
|
|
44
51
|
type UserInputTranscribedEvent,
|
|
45
52
|
type UserState,
|
|
@@ -50,6 +57,7 @@ import {
|
|
|
50
57
|
createUserStateChangedEvent,
|
|
51
58
|
} from './events.js';
|
|
52
59
|
import { AgentInput, AgentOutput } from './io.js';
|
|
60
|
+
import { RecorderIO } from './recorder_io/index.js';
|
|
53
61
|
import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
|
|
54
62
|
import type { UnknownUserData } from './run_context.js';
|
|
55
63
|
import type { SpeechHandle } from './speech_handle.js';
|
|
@@ -100,6 +108,7 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
|
|
|
100
108
|
tts?: TTS | TTSModelString;
|
|
101
109
|
userData?: UserData;
|
|
102
110
|
voiceOptions?: Partial<VoiceOptions>;
|
|
111
|
+
connOptions?: SessionConnectOptions;
|
|
103
112
|
};
|
|
104
113
|
|
|
105
114
|
export class AgentSession<
|
|
@@ -132,10 +141,20 @@ export class AgentSession<
|
|
|
132
141
|
private closingTask: Promise<void> | null = null;
|
|
133
142
|
private userAwayTimer: NodeJS.Timeout | null = null;
|
|
134
143
|
|
|
144
|
+
// Connection options for STT, LLM, and TTS
|
|
145
|
+
private _connOptions: ResolvedSessionConnectOptions;
|
|
146
|
+
|
|
147
|
+
// Unrecoverable error counts, reset after agent speaking
|
|
148
|
+
private llmErrorCounts = 0;
|
|
149
|
+
private ttsErrorCounts = 0;
|
|
150
|
+
|
|
135
151
|
private sessionSpan?: Span;
|
|
136
152
|
private userSpeakingSpan?: Span;
|
|
137
153
|
private agentSpeakingSpan?: Span;
|
|
138
154
|
|
|
155
|
+
/** @internal */
|
|
156
|
+
_recorderIO?: RecorderIO;
|
|
157
|
+
|
|
139
158
|
/** @internal */
|
|
140
159
|
rootSpanContext?: Context;
|
|
141
160
|
|
|
@@ -159,8 +178,19 @@ export class AgentSession<
|
|
|
159
178
|
turnDetection,
|
|
160
179
|
userData,
|
|
161
180
|
voiceOptions = defaultVoiceOptions,
|
|
181
|
+
connOptions,
|
|
162
182
|
} = opts;
|
|
163
183
|
|
|
184
|
+
// Merge user-provided connOptions with defaults
|
|
185
|
+
this._connOptions = {
|
|
186
|
+
sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions },
|
|
187
|
+
llmConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.llmConnOptions },
|
|
188
|
+
ttsConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.ttsConnOptions },
|
|
189
|
+
maxUnrecoverableErrors:
|
|
190
|
+
connOptions?.maxUnrecoverableErrors ??
|
|
191
|
+
DEFAULT_SESSION_CONNECT_OPTIONS.maxUnrecoverableErrors,
|
|
192
|
+
};
|
|
193
|
+
|
|
164
194
|
this.vad = vad;
|
|
165
195
|
|
|
166
196
|
if (typeof stt === 'string') {
|
|
@@ -225,6 +255,11 @@ export class AgentSession<
|
|
|
225
255
|
return this._chatCtx;
|
|
226
256
|
}
|
|
227
257
|
|
|
258
|
+
/** Connection options for STT, LLM, and TTS. */
|
|
259
|
+
get connOptions(): ResolvedSessionConnectOptions {
|
|
260
|
+
return this._connOptions;
|
|
261
|
+
}
|
|
262
|
+
|
|
228
263
|
set userData(value: UserData) {
|
|
229
264
|
this._userData = value;
|
|
230
265
|
}
|
|
@@ -234,14 +269,12 @@ export class AgentSession<
|
|
|
234
269
|
room,
|
|
235
270
|
inputOptions,
|
|
236
271
|
outputOptions,
|
|
237
|
-
record,
|
|
238
272
|
span,
|
|
239
273
|
}: {
|
|
240
274
|
agent: Agent;
|
|
241
275
|
room: Room;
|
|
242
276
|
inputOptions?: Partial<RoomInputOptions>;
|
|
243
277
|
outputOptions?: Partial<RoomOutputOptions>;
|
|
244
|
-
record: boolean;
|
|
245
278
|
span: Span;
|
|
246
279
|
}): Promise<void> {
|
|
247
280
|
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, agent.id);
|
|
@@ -275,20 +308,39 @@ export class AgentSession<
|
|
|
275
308
|
});
|
|
276
309
|
this.roomIO.start();
|
|
277
310
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
311
|
+
let ctx: JobContext | undefined = undefined;
|
|
312
|
+
try {
|
|
313
|
+
ctx = getJobContext();
|
|
314
|
+
} catch (error) {
|
|
315
|
+
// JobContext is not available in evals
|
|
316
|
+
this.logger.warn('JobContext is not available');
|
|
282
317
|
}
|
|
283
318
|
|
|
284
|
-
if (
|
|
319
|
+
if (ctx) {
|
|
320
|
+
if (ctx.room === room && !room.isConnected) {
|
|
321
|
+
this.logger.debug('Auto-connecting to room via job context');
|
|
322
|
+
tasks.push(ctx.connect());
|
|
323
|
+
}
|
|
324
|
+
|
|
285
325
|
if (ctx._primaryAgentSession === undefined) {
|
|
286
326
|
ctx._primaryAgentSession = this;
|
|
287
|
-
} else {
|
|
327
|
+
} else if (this._enableRecording) {
|
|
288
328
|
throw new Error(
|
|
289
|
-
'Only one `AgentSession` can be the primary at a time. If you want to ignore primary designation, use session.start(record
|
|
329
|
+
'Only one `AgentSession` can be the primary at a time. If you want to ignore primary designation, use `session.start({ record: false })`.',
|
|
290
330
|
);
|
|
291
331
|
}
|
|
332
|
+
|
|
333
|
+
if (this.input.audio && this.output.audio && this._enableRecording) {
|
|
334
|
+
this._recorderIO = new RecorderIO({ agentSession: this });
|
|
335
|
+
this.input.audio = this._recorderIO.recordInput(this.input.audio);
|
|
336
|
+
this.output.audio = this._recorderIO.recordOutput(this.output.audio);
|
|
337
|
+
|
|
338
|
+
// Start recording to session directory
|
|
339
|
+
const sessionDir = ctx.sessionDirectory;
|
|
340
|
+
if (sessionDir) {
|
|
341
|
+
tasks.push(this._recorderIO.start(`${sessionDir}/audio.ogg`));
|
|
342
|
+
}
|
|
343
|
+
}
|
|
292
344
|
}
|
|
293
345
|
|
|
294
346
|
// TODO(AJS-265): add shutdown callback to job context
|
|
@@ -327,29 +379,29 @@ export class AgentSession<
|
|
|
327
379
|
return;
|
|
328
380
|
}
|
|
329
381
|
|
|
330
|
-
|
|
382
|
+
let ctx: JobContext | undefined = undefined;
|
|
383
|
+
try {
|
|
384
|
+
ctx = getJobContext();
|
|
331
385
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
);
|
|
386
|
+
if (record === undefined) {
|
|
387
|
+
record = ctx.job.enableRecording;
|
|
388
|
+
}
|
|
336
389
|
|
|
337
|
-
|
|
338
|
-
this._enableRecording = record;
|
|
390
|
+
this._enableRecording = record;
|
|
339
391
|
|
|
340
|
-
|
|
341
|
-
|
|
392
|
+
if (this._enableRecording) {
|
|
393
|
+
ctx.initRecording();
|
|
394
|
+
}
|
|
395
|
+
} catch (error) {
|
|
396
|
+
// JobContext is not available in evals
|
|
397
|
+
this.logger.warn('JobContext is not available');
|
|
342
398
|
}
|
|
343
399
|
|
|
344
|
-
// Create agent_session as a ROOT span (new trace) to match Python behavior
|
|
345
|
-
// This creates a separate trace for better cloud dashboard organization
|
|
346
400
|
this.sessionSpan = tracer.startSpan({
|
|
347
401
|
name: 'agent_session',
|
|
348
402
|
context: ROOT_CONTEXT,
|
|
349
403
|
});
|
|
350
404
|
|
|
351
|
-
// Set the session span as the active span in the context
|
|
352
|
-
// This ensures all child spans (agent_turn, user_turn, etc.) are parented to it
|
|
353
405
|
this.rootSpanContext = trace.setSpan(ROOT_CONTEXT, this.sessionSpan);
|
|
354
406
|
|
|
355
407
|
await this._startImpl({
|
|
@@ -357,7 +409,6 @@ export class AgentSession<
|
|
|
357
409
|
room,
|
|
358
410
|
inputOptions,
|
|
359
411
|
outputOptions,
|
|
360
|
-
record,
|
|
361
412
|
span: this.sessionSpan,
|
|
362
413
|
});
|
|
363
414
|
}
|
|
@@ -397,7 +448,17 @@ export class AgentSession<
|
|
|
397
448
|
throw new Error('AgentSession is not running');
|
|
398
449
|
}
|
|
399
450
|
|
|
400
|
-
|
|
451
|
+
const doSay = (activity: AgentActivity) => {
|
|
452
|
+
return activity.say(text, options);
|
|
453
|
+
};
|
|
454
|
+
|
|
455
|
+
// attach to the session span if called outside of the AgentSession
|
|
456
|
+
const activeSpan = trace.getActiveSpan();
|
|
457
|
+
if (!activeSpan && this.rootSpanContext) {
|
|
458
|
+
return otelContext.with(this.rootSpanContext, () => doSay(this.activity!));
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
return doSay(this.activity);
|
|
401
462
|
}
|
|
402
463
|
|
|
403
464
|
interrupt() {
|
|
@@ -424,14 +485,25 @@ export class AgentSession<
|
|
|
424
485
|
})
|
|
425
486
|
: undefined;
|
|
426
487
|
|
|
427
|
-
|
|
428
|
-
if (
|
|
429
|
-
|
|
488
|
+
const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
|
|
489
|
+
if (activity.draining) {
|
|
490
|
+
if (!nextActivity) {
|
|
491
|
+
throw new Error('AgentSession is closing, cannot use generateReply()');
|
|
492
|
+
}
|
|
493
|
+
return nextActivity.generateReply({ userMessage, ...options });
|
|
430
494
|
}
|
|
431
|
-
return
|
|
495
|
+
return activity.generateReply({ userMessage, ...options });
|
|
496
|
+
};
|
|
497
|
+
|
|
498
|
+
// attach to the session span if called outside of the AgentSession
|
|
499
|
+
const activeSpan = trace.getActiveSpan();
|
|
500
|
+
if (!activeSpan && this.rootSpanContext) {
|
|
501
|
+
return otelContext.with(this.rootSpanContext, () =>
|
|
502
|
+
doGenerateReply(this.activity!, this.nextActivity),
|
|
503
|
+
);
|
|
432
504
|
}
|
|
433
505
|
|
|
434
|
-
return this.activity.
|
|
506
|
+
return doGenerateReply(this.activity!, this.nextActivity);
|
|
435
507
|
}
|
|
436
508
|
|
|
437
509
|
private async updateActivity(agent: Agent): Promise<void> {
|
|
@@ -492,13 +564,22 @@ export class AgentSession<
|
|
|
492
564
|
await this.closeImpl(CloseReason.USER_INITIATED);
|
|
493
565
|
}
|
|
494
566
|
|
|
567
|
+
shutdown(options?: { drain?: boolean; reason?: ShutdownReason }): void {
|
|
568
|
+
const { drain = true, reason = CloseReason.USER_INITIATED } = options ?? {};
|
|
569
|
+
|
|
570
|
+
this._closeSoon({
|
|
571
|
+
reason,
|
|
572
|
+
drain,
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
|
|
495
576
|
/** @internal */
|
|
496
577
|
_closeSoon({
|
|
497
578
|
reason,
|
|
498
579
|
drain = false,
|
|
499
580
|
error = null,
|
|
500
581
|
}: {
|
|
501
|
-
reason:
|
|
582
|
+
reason: ShutdownReason;
|
|
502
583
|
drain?: boolean;
|
|
503
584
|
error?: RealtimeModelError | STTError | TTSError | LLMError | null;
|
|
504
585
|
}): void {
|
|
@@ -514,6 +595,19 @@ export class AgentSession<
|
|
|
514
595
|
return;
|
|
515
596
|
}
|
|
516
597
|
|
|
598
|
+
// Track error counts per type to implement max_unrecoverable_errors logic
|
|
599
|
+
if (error.type === 'llm_error') {
|
|
600
|
+
this.llmErrorCounts += 1;
|
|
601
|
+
if (this.llmErrorCounts <= this._connOptions.maxUnrecoverableErrors) {
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
604
|
+
} else if (error.type === 'tts_error') {
|
|
605
|
+
this.ttsErrorCounts += 1;
|
|
606
|
+
if (this.ttsErrorCounts <= this._connOptions.maxUnrecoverableErrors) {
|
|
607
|
+
return;
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
|
|
517
611
|
this.logger.error(error, 'AgentSession is closing due to unrecoverable error');
|
|
518
612
|
|
|
519
613
|
this.closingTask = (async () => {
|
|
@@ -541,7 +635,9 @@ export class AgentSession<
|
|
|
541
635
|
}
|
|
542
636
|
|
|
543
637
|
if (state === 'speaking') {
|
|
544
|
-
//
|
|
638
|
+
// Reset error counts when agent starts speaking
|
|
639
|
+
this.llmErrorCounts = 0;
|
|
640
|
+
this.ttsErrorCounts = 0;
|
|
545
641
|
|
|
546
642
|
if (this.agentSpeakingSpan === undefined) {
|
|
547
643
|
this.agentSpeakingSpan = tracer.startSpan({
|
|
@@ -657,7 +753,7 @@ export class AgentSession<
|
|
|
657
753
|
}
|
|
658
754
|
|
|
659
755
|
private async closeImpl(
|
|
660
|
-
reason:
|
|
756
|
+
reason: ShutdownReason,
|
|
661
757
|
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
|
|
662
758
|
drain: boolean = false,
|
|
663
759
|
): Promise<void> {
|
|
@@ -671,7 +767,7 @@ export class AgentSession<
|
|
|
671
767
|
}
|
|
672
768
|
|
|
673
769
|
private async closeImplInner(
|
|
674
|
-
reason:
|
|
770
|
+
reason: ShutdownReason,
|
|
675
771
|
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
|
|
676
772
|
drain: boolean = false,
|
|
677
773
|
): Promise<void> {
|
|
@@ -694,7 +790,16 @@ export class AgentSession<
|
|
|
694
790
|
await this.activity.drain();
|
|
695
791
|
// wait any uninterruptible speech to finish
|
|
696
792
|
await this.activity.currentSpeech?.waitForPlayout();
|
|
697
|
-
|
|
793
|
+
try {
|
|
794
|
+
this.activity.detachAudioInput();
|
|
795
|
+
} catch (error) {
|
|
796
|
+
// Ignore detach errors during cleanup - source may not have been set
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
// Close recorder before detaching inputs/outputs (keep reference for session report)
|
|
801
|
+
if (this._recorderIO) {
|
|
802
|
+
await this._recorderIO.close();
|
|
698
803
|
}
|
|
699
804
|
|
|
700
805
|
// detach the inputs and outputs
|
|
@@ -730,6 +835,8 @@ export class AgentSession<
|
|
|
730
835
|
this.userState = 'listening';
|
|
731
836
|
this._agentState = 'initializing';
|
|
732
837
|
this.rootSpanContext = undefined;
|
|
838
|
+
this.llmErrorCounts = 0;
|
|
839
|
+
this.ttsErrorCounts = 0;
|
|
733
840
|
|
|
734
841
|
this.logger.info({ reason, error }, 'AgentSession closed');
|
|
735
842
|
}
|
package/src/voice/events.ts
CHANGED
|
@@ -5,9 +5,10 @@ import type {
|
|
|
5
5
|
ChatMessage,
|
|
6
6
|
FunctionCall,
|
|
7
7
|
FunctionCallOutput,
|
|
8
|
+
LLM,
|
|
9
|
+
RealtimeModel,
|
|
8
10
|
RealtimeModelError,
|
|
9
11
|
} from '../llm/index.js';
|
|
10
|
-
import type { LLM, RealtimeModel } from '../llm/index.js';
|
|
11
12
|
import type { LLMError } from '../llm/llm.js';
|
|
12
13
|
import type { AgentMetrics } from '../metrics/base.js';
|
|
13
14
|
import type { STT } from '../stt/index.js';
|
|
@@ -38,6 +39,8 @@ export enum CloseReason {
|
|
|
38
39
|
USER_INITIATED = 'user_initiated',
|
|
39
40
|
}
|
|
40
41
|
|
|
42
|
+
export type ShutdownReason = CloseReason | string;
|
|
43
|
+
|
|
41
44
|
export type SpeechSource = 'say' | 'generate_reply' | 'tool_response';
|
|
42
45
|
|
|
43
46
|
export type UserStateChangedEvent = {
|
|
@@ -231,12 +234,12 @@ export const createErrorEvent = (
|
|
|
231
234
|
export type CloseEvent = {
|
|
232
235
|
type: 'close';
|
|
233
236
|
error: RealtimeModelError | STTError | TTSError | LLMError | null;
|
|
234
|
-
reason:
|
|
237
|
+
reason: ShutdownReason;
|
|
235
238
|
createdAt: number;
|
|
236
239
|
};
|
|
237
240
|
|
|
238
241
|
export const createCloseEvent = (
|
|
239
|
-
reason:
|
|
242
|
+
reason: ShutdownReason,
|
|
240
243
|
error: RealtimeModelError | STTError | TTSError | LLMError | null = null,
|
|
241
244
|
createdAt: number = Date.now(),
|
|
242
245
|
): CloseEvent => ({
|
package/src/voice/generation.ts
CHANGED
|
@@ -24,7 +24,7 @@ import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
|
|
|
24
24
|
import { log } from '../log.js';
|
|
25
25
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
26
26
|
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
27
|
-
import { Future, Task, shortuuid, toError } from '../utils.js';
|
|
27
|
+
import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
|
|
28
28
|
import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
|
|
29
29
|
import type { AgentSession } from './agent_session.js';
|
|
30
30
|
import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
|
|
@@ -411,17 +411,19 @@ export function performLLMInference(
|
|
|
411
411
|
return;
|
|
412
412
|
}
|
|
413
413
|
|
|
414
|
+
const abortPromise = waitForAbort(signal);
|
|
415
|
+
|
|
414
416
|
// TODO(brian): add support for dynamic tools
|
|
415
417
|
|
|
416
418
|
llmStreamReader = llmStream.getReader();
|
|
417
419
|
while (true) {
|
|
418
|
-
if (signal.aborted)
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
420
|
+
if (signal.aborted) break;
|
|
421
|
+
|
|
422
|
+
const result = await Promise.race([llmStreamReader.read(), abortPromise]);
|
|
423
|
+
if (result === undefined) break;
|
|
424
|
+
|
|
425
|
+
const { done, value: chunk } = result;
|
|
426
|
+
if (done) break;
|
|
425
427
|
|
|
426
428
|
if (typeof chunk === 'string') {
|
|
427
429
|
data.generatedText += chunk;
|
package/src/voice/io.ts
CHANGED
|
@@ -28,6 +28,7 @@ export type TTSNode = (
|
|
|
28
28
|
text: ReadableStream<string>,
|
|
29
29
|
modelSettings: ModelSettings,
|
|
30
30
|
) => Promise<ReadableStream<AudioFrame> | null>;
|
|
31
|
+
|
|
31
32
|
export abstract class AudioInput {
|
|
32
33
|
protected deferredStream: DeferredReadableStream<AudioFrame> =
|
|
33
34
|
new DeferredReadableStream<AudioFrame>();
|
|
@@ -128,6 +129,24 @@ export abstract class AudioOutput extends EventEmitter {
|
|
|
128
129
|
this.nextInChain.onDetached();
|
|
129
130
|
}
|
|
130
131
|
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Pause the audio playback
|
|
135
|
+
*/
|
|
136
|
+
pause(): void {
|
|
137
|
+
if (this.nextInChain) {
|
|
138
|
+
this.nextInChain.pause();
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Resume the audio playback
|
|
144
|
+
*/
|
|
145
|
+
resume(): void {
|
|
146
|
+
if (this.nextInChain) {
|
|
147
|
+
this.nextInChain.resume();
|
|
148
|
+
}
|
|
149
|
+
}
|
|
131
150
|
}
|
|
132
151
|
|
|
133
152
|
export interface PlaybackFinishedEvent {
|